In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd 'drive/My Drive/pseudo-class generation'

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

!nvidia-smi

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/pseudo-class generation
Thu Feb 29 22:42:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |     

In [None]:
pip install helpers



# Imports

In [None]:
# License: BSD
# Author: Sasank Chilamkurthy

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

# Turn on cuDNN benchmark mode for the whole session.
cudnn.benchmark = True
plt.ion()   # interactive mode

from helpers import makedir
from log import create_logger
from preprocess import mean, std, preprocess_input_function

# Parameters

In [None]:
# Backbones that can be selected: [resnet, alexnet, vgg, squeezenet, densenet, inception]

# --- Clustering -------------------------------------------------------------
cls_type = 'M'  # Note!: set to 'B' or 'M' to process Benign / Malignant data separately
n_cluster = 8

# --- Feature-extractor checkpoint --------------------------------------------
# Pick the checkpoint saved at the epoch with the best test/val accuracy.
selected_model = "50_tensor(0.8793, device=_cuda_0_, dtype=torch.float64).pt"
n_fold = '1'
model_name = "resnet"
model_dir = f'./saved_models/{model_name}_fold{n_fold}/'
makedir(model_dir)
PATH = filepath = f'{model_dir}{selected_model}'

# --- Training / data settings --------------------------------------------------
target_test_accu = 0.80
img_size = 300  # 224
num_epochs = 30
num_classes = 2
BATCH_SIZE = 80
train_batch_size = test_batch_size = train_push_batch_size = BATCH_SIZE

# Model initialization

In [None]:
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze or unfreeze every parameter returned by ``model.parameters()``.

    Args:
        model: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``);
            its parameters are modified in place.
        feature_extracting: truthy -> all parameters get ``requires_grad = False``
            (frozen backbone); falsy -> all parameters get ``requires_grad = True``.

    Returns:
        None.
    """
    # Any falsy flag (False, 0, None) means "fine-tune everything"; the flag is
    # never compared against the literal False.
    trainable = not feature_extracting
    for param in model.parameters():
        param.requires_grad = trainable

def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    """Build a torchvision backbone whose classification head emits ``num_classes`` outputs.

    Args:
        model_name: one of "resnet", "alexnet", "vgg", "squeezenet",
            "densenet", "inception".
        num_classes: number of outputs of the replaced classification layer.
        feature_extract: forwarded to ``set_parameter_requires_grad``; it is applied
            BEFORE the head is replaced, so the new head is a freshly created layer.
        use_pretrained: forwarded as ``pretrained=`` to the torchvision constructor.

    Returns:
        The constructed model.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name == "resnet":
        # ResNet-18
        model_ft = models.resnet18(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)

    elif model_name == "alexnet":
        # AlexNet: the last classifier layer (index 6) is the output layer.
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)

    elif model_name == "vgg":
        # VGG-19 (without batch norm)
        model_ft = models.vgg19(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)

    elif model_name == "squeezenet":
        # SqueezeNet 1.0: the classifier is a 1x1 convolution rather than a linear layer.
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1, 1), stride=(1, 1))
        model_ft.num_classes = num_classes

    elif model_name == "densenet":
        # DenseNet-121
        model_ft = models.densenet121(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)

    elif model_name == "inception":
        # Inception v3.
        # Be careful: expects (299, 299) sized images and has an auxiliary output.
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxiliary net
        num_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
        # Handle the primary net
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)

    else:
        # Raise instead of exit(): exiting from inside a notebook cell takes the
        # whole kernel down and hides which name was wrong.
        raise ValueError(
            "Invalid model name {!r}; expected one of: resnet, alexnet, vgg, "
            "squeezenet, densenet, inception".format(model_name)
        )

    return model_ft

# Load data

In [None]:
# Dataset locations for the selected fold / magnification.
n_fold = '1'
magnification = '40x'
#train_test = '/test'
fold = f'revised_fold{n_fold}_{magnification}'
data_path = f'./dataset/{fold}/'
train_dir = f'{data_path}train_push_balanced/train_augmented/'
test_dir = f'{data_path}test/'
train_push_dir = f'{data_path}train/'

log, logclose = create_logger(log_filename=os.path.join(model_dir, 'train.log'))

# Normalization shared by every dataset below.
normalize = transforms.Normalize(mean=mean, std=std)


def _resized_image_folder(root):
    """ImageFolder over ``root``: resize to (img_size, img_size), convert to tensor, normalize."""
    pipeline = transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.ToTensor(),
        normalize,
    ])
    return datasets.ImageFolder(root, pipeline)


def _per_sample_loader(dataset, shuffle):
    """DataLoader with automatic batching disabled (``batch_size=None``)."""
    return torch.utils.data.DataLoader(
        dataset, batch_size=None, shuffle=shuffle,
        num_workers=4, pin_memory=False)


# push set, augmented train set, and their concatenation (push samples come first)
train_push_dataset = _resized_image_folder(train_push_dir)
train_dataset_0 = _resized_image_folder(train_dir)
train_dataset = torch.utils.data.ConcatDataset([train_push_dataset, train_dataset_0])

train_loader = _per_sample_loader(train_dataset, shuffle=False)
train_push_loader = _per_sample_loader(train_push_dataset, shuffle=False)
train_loader_0 = _per_sample_loader(train_dataset_0, shuffle=True)

# test set
test_dataset = _resized_image_folder(test_dir)
test_loader = _per_sample_loader(test_dataset, shuffle=False)

# we should look into distributed sampler more carefully at torch.utils.data.distributed.DistributedSampler(train_dataset)

for caption, loader in (
    ('push set size: {0}', train_push_loader),
    ('training_0 set size: {0}', train_loader_0),
    ('training set size: {0}', train_loader),
    ('test set size: {0}', test_loader),
):
    log(caption.format(len(loader.dataset)))

# Loader that the train feature-extraction cell iterates over.
dataloaders = train_loader

# File paths in the same order as the (unshuffled) loaders yield their samples.
imgs_filepathes_push = [path for path, _ in train_push_dataset.samples]
imgs_filepathes_train0 = [path for path, _ in train_dataset_0.samples]
imgs_filepathes = imgs_filepathes_push + imgs_filepathes_train0
imgs_filepathes_test = [path for path, _ in test_dataset.samples]

push set size: 724
training_0 set size: 4800
training set size: 5524
test set size: 366


In [None]:
# Show the count and a short preview only; printing every path floods the cell output.
print(len(imgs_filepathes_push), imgs_filepathes_push[:5])

['./dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-001.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-002.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-003.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-004.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-005.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-006.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-007.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-008.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-009.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-010.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-011.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-012.png', './dataset/revised_fold1_40x/train/B_train/0_SOB_B_F-14-14134-40-013.png', './dataset/revised_fold1

# Feature Extraction / mapping data to latent space

In [None]:
# Map train data to the feature (latent) space

from tqdm.auto import tqdm, trange

## Select trained model at epoch with best test/val accuracy as feature extractor
#filepath = model_dir + "6_tensor(0.8711, device='cuda:0', dtype=torch.float64).pt"
#PATH = filepath

# Build the architecture for this run. Every weight is overwritten by the
# checkpoint loaded right below, so the pretrained download is skipped.
modelA = initialize_model(model_name, num_classes, feature_extract = False, use_pretrained=False)

model = modelA
# map_location remaps the saved tensors onto the device selected for this session.
model.load_state_dict(torch.load(PATH, map_location=device))
model.to(device)
# Inference mode: without this, normalization layers keep using per-batch statistics
# and update their running statistics on every forward pass.
model.eval()

## Feature extraction

activation = {}
def get_activation(name):
    """Return a forward hook that stores the hooked module's detached output in ``activation[name]``."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook
# The hook stays registered: the test feature-extraction cell reads activation['avgpool'] too.
model.avgpool.register_forward_hook(get_activation('avgpool'))

features = []
# No gradients are needed for feature extraction.
with torch.no_grad():
    for im, label in tqdm(dataloaders):
        # The loader yields single samples (batch_size=None), so add the batch axis here.
        x = im.unsqueeze(0).to(device)
        model(x)
        feat = activation['avgpool']
        # Keep the channel vector of the only sample at spatial position (0, 0).
        features.append(feat[0, :, 0, 0].cpu().numpy())


[H[2J



  0%|          | 0/5071 [00:00<?, ?it/s]

In [None]:
# Cluster the extracted features with k-means
from sklearn.cluster import KMeans

# n_init: number of times k-means is run with different centroid seeds; the final
# result is the best of those runs in terms of inertia.
clusters = KMeans(n_clusters=n_cluster, n_init=10, random_state=40).fit(features)

# Cluster index for every sample that was used for fitting
Pred_cluster = clusters.predict(features)
print('length_preds:', len(Pred_cluster) )



# Construct new folders containing the data relabeled by cluster

In [None]:
import os
import shutil

#torch.save(Pred_cluster, cls_type + '_data_Predicted_cluster')
# Folder-label offset: 'M' clusters are numbered n_cluster..2*n_cluster-1,
# any other cls_type keeps 0..n_cluster-1.
K = 0
if cls_type == 'M':
    K = n_cluster

data_path_0 = 'datasets_exp_' + str(2*n_cluster) + '/'

n_fold = '1'
magnification = '40x'
fold = 'fold' + n_fold + '_' + magnification
data_path2 = data_path_0 + fold + '/'
dst_dir_push = data_path2 + 'train_push/'
# Creates data_path_0 / data_path2 / train_push in one call; no error if they exist.
os.makedirs(dst_dir_push, exist_ok=True)

cnt_data_push = torch.zeros(n_cluster)
# Never incremented in this cell: only the push images are copied.
cnt_data_aug = torch.zeros(n_cluster)

# Push samples come first in imgs_filepathes / Pred_cluster. Take the count from
# the dataset, so it stays correct even if the loader is switched to batching.
n_push = len(train_push_dataset)
for src_file, cluster_id in zip(imgs_filepathes[:n_push], Pred_cluster[:n_push]):
    cluster_id = int(cluster_id)
    f_name = '{:02d}'.format(cluster_id + K)  # zero-padded folder name, e.g. '08'
    dst_dir = dst_dir_push + f_name
    os.makedirs(dst_dir, exist_ok=True)
    cnt_data_push[cluster_id] += 1

    # basename instead of a fixed '/'-split index: independent of directory depth.
    img_name = os.path.basename(src_file)
    dst_file = dst_dir + '/' + img_name
    shutil.copy(src_file, dst_file)


print('cnt_data_aug_'+str(cls_type)+':', cnt_data_aug)
print('cnt_data_push_'+str(cls_type)+':', cnt_data_push)


cnt_data_aug_M: tensor([0., 0., 0., 0., 0., 0., 0., 0.])
cnt_data_push_M: tensor([ 92.,  80.,  84.,  96.,  95.,  71., 112.,  94.])


In [None]:
# Map test data to the feature (latent) space

import os
import shutil

# Inference mode: without this, normalization layers keep using per-batch statistics
# and update their running statistics on every forward pass.
model.eval()

# NOTE(review): this reuses the name `features`, overwriting the train features that
# the KMeans cell fits on; re-run the train feature cell before re-fitting.
features = []
# No gradients are needed for feature extraction.
with torch.no_grad():
    for im, label in tqdm(test_loader):
        # The loader yields single samples (batch_size=None), so add the batch axis here.
        x = im.unsqueeze(0).to(device)
        model(x)
        # Filled by the forward hook registered on model.avgpool.
        feat = activation['avgpool']
        features.append(feat[0, :, 0, 0].cpu().numpy())
Pred_cluster_t = clusters.predict(features)

In [None]:
# Test data relabeling
import os
import shutil

# Folder-label offset: 'M' clusters are numbered n_cluster..2*n_cluster-1,
# any other cls_type keeps 0..n_cluster-1.
K = 0
if cls_type == 'M':
    K = n_cluster


#data_path_0 = 'datasets_exp_20/'
data_path_0 = 'datasets_exp_' + str(2*n_cluster) + '/'

n_fold = '1'
magnification = '40x'
fold = 'fold' + n_fold + '_' + magnification
data_path2 = data_path_0 + fold + '/'

dst_dir_test = data_path2 + 'test/'
# makedirs also creates data_path_0 and data_path2, so this cell no longer
# depends on another cell having created the fold directory first.
os.makedirs(dst_dir_test, exist_ok=True)

cnt_data_test = torch.zeros(n_cluster)
for src_file, cluster_id in zip(imgs_filepathes_test, Pred_cluster_t):
    cluster_id = int(cluster_id)
    f_name = '{:02d}'.format(cluster_id + K)  # zero-padded folder name, e.g. '08'
    dst_dir = dst_dir_test + f_name
    os.makedirs(dst_dir, exist_ok=True)
    cnt_data_test[cluster_id] += 1

    # basename instead of a fixed '/'-split index: independent of directory depth.
    img_name = os.path.basename(src_file)
    dst_file = dst_dir + '/' + img_name
    shutil.copy(src_file, dst_file)


print('cnt_data_test_'+str(cls_type)+':', cnt_data_test)


cnt_data_test_M: tensor([56., 44., 46., 40., 44., 43., 46., 47.])
