In [64]:
import copy
import json
import math
import os
import random
import shutil
import time
import pandas as pd
import time

import pickle
import hashlib

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data

from scipy.stats import entropy

import sklearn
import copy
import sys

import gc
from torch.utils.data import DataLoader


import torchvision.transforms as transforms
from PIL import Image

#import Verma.experts as vexp
import Verma.losses as vlos
from Verma.utils import AverageMeter, accuracy
import Verma.resnet50 as vres
from AL.utils import *
from AL.metrics import *

import Dataset.Dataset as ds

import ssl_functions as ssl
import active_learning as al
from active_learning import NIHExpertDatasetMemory

import expert as expert_module
import verma as verm
import hemmer as hm

import neptune

import json
import shutil

import glob

In [65]:
def set_seed(seed, fold=None, text=None):
    if fold is not None and text is not None:
        s = text + f" + {seed} + {fold}"
        seed = int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % 10**8
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

In [66]:
import neptune

import json

with open('neptune_config.json', 'r') as f:
    config = json.load(f)

config_neptune = config["neptune"]

In [67]:
import shutil
def cleanTrainDir(path):
    shutil.rmtree(path)

In [68]:
def getExpertModelSSL_AL(dataManager, expert, labelerId, param=None, seed=None, fold=None, learning_mod="ssl", prediction_type="target"):

    if param["SETTING"] == "SSL_AL_SSL":
        learning_type = "ssl"
    elif param["SETTING"] == "SSL_AL":
        learning_type = "sl"

    assert learning_type != "", "Need to define how experts should be trained with new AL data (sl or ssl)"
    
    nih_dataloader = dataManager.getKFoldDataloader(seed)

    expert_train, expert_val, expert_test = nih_dataloader.get_dataset_for_folder(fold)
    image_container = nih_dataloader.get_ImageContainer()
    train_dataset = ds.NIHDataset(expert_train, preload=False, preprocess=False, param=param, image_container=image_container)
    val_dataset = ds.NIHDataset(expert_val, preload=False, preprocess=False, param=param, image_container=image_container)
    test_dataset = ds.NIHDataset(expert_test, preload=False, preprocess=False, param=param, image_container=image_container)

    sslDataset = dataManager.getSSLDataset(seed)
    usedFilenames = sslDataset.getLabeledFilenames(labelerId, fold)
    
    # initialize data, Erhält alle Indizes der Daten
    all_indices = list(range(len(train_dataset.getAllIndices())))
    all_data_filenames = np.array(train_dataset.getAllFilenames())[all_indices]
    all_data_y = np.array(train_dataset.getAllTargets())[all_indices]

    used_indices = [index for index in all_indices if all_data_filenames[index] in usedFilenames]
    indices = used_indices

    print("Len overlapping used indices: " + str(len(used_indices)))

    metrics = {}

    met = al.testExpert(expert, val_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Val")
    metrics["Val"] = {
        "Start": met,
    }

    met = al.testExpert(expert, test_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Test")
    metrics["Test"] = {
        "Start": met,
    }

    metrics["Train"] = {}

    set_seed(seed, fold, text="")
    
    Intial_random_set = indices
    indices_labeled  = Intial_random_set
    indices_unlabeled= list(set(all_indices) - set(indices_labeled))

    # Lädt die Datasets für die beschrifteten und unbeschrifteten Daten
    dataset_train_unlabeled = NIHExpertDatasetMemory(None, all_data_filenames[indices_unlabeled], all_data_y[indices_unlabeled], None , [0]*len(indices_unlabeled), indices_unlabeled, param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
    dataset_val_unlabeled = NIHExpertDatasetMemory(None, val_dataset.getAllFilenames(), np.array(val_dataset.getAllTargets()), expert.predict , [1]*len(val_dataset.getAllIndices()), val_dataset.getAllIndices(), param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
    
    # Lädt die Dataloaders
    dataLoaderTrainUnlabeled = DataLoader(dataset=dataset_train_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)
    dataLoaderValUnlabeled = DataLoader(dataset=dataset_val_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)
    
    for round in range(param["AL"]["ROUNDS"]):

        print(f'\n \n Round {round} \n \n')

        # get points where expert model is least confident on
        indices_confidence = al.get_least_confident_points(expert, dataLoaderTrainUnlabeled, param["AL"]["LABELS_PER_ROUND"], mod="ssl")
        indices_labeled  = indices_labeled + list(indices_confidence) 
        indices_unlabeled= list(set(all_indices) - set(indices_labeled))

        dataset_train_labeled = NIHExpertDatasetMemory(None, all_data_filenames[indices_labeled], all_data_y[indices_labeled], expert.predict , [1]*len(indices_labeled), indices_labeled, param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
        dataset_train_unlabeled = NIHExpertDatasetMemory(None, all_data_filenames[indices_unlabeled], all_data_y[indices_unlabeled], expert.predict , [0]*len(indices_unlabeled), indices_unlabeled, param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)

        dataLoaderTrainLabeled = DataLoader(dataset=dataset_train_labeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)
        dataLoaderTrainUnlabeled = DataLoader(dataset=dataset_train_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)

        if learning_type == "ssl": #If the experts should be trained with ssl
            sslDataset = dataManager.getSSLDataset(seed)
            sslDataset.addNewLabels(all_data_filenames[list(indices_confidence)], fold, expert.labelerId)
            emb_model, model = ssl.getExpertModelSSL(labelerId=expert.labelerId, sslDataset=sslDataset, seed=seed, fold_idx=fold, n_labeled=None, embedded_model=None, param=param, neptune_param=param["NEPTUNE"], added_epochs=(round+1)*param["AL"]["SSL_EPOCHS"])
            expert.setModel(expert_module.SSLModel(emb_model, model), mod="SSL")


            #TODO: Test experts and get metrics
            n_images = param["AL"]["INITIAL_SIZE"] + (round+1)*param["AL"]["LABELS_PER_ROUND"]

            train_metrics = al.metrics_print_expert(model=None, expert=expert, data_loader=dataLoaderTrainLabeled, id=expert.labelerId, seed=seed, fold=fold, n_images=n_images, step="Train", param=param, mod="ssl", prediction_type="target", print_result=False)
            val_metrics = al.metrics_print_expert(model=None, expert=expert, data_loader=dataLoaderValUnlabeled, id=expert.labelerId, seed=seed, fold=fold, n_images=n_images, step="Val", param=param, mod="ssl", prediction_type="target")

            metrics["Train"][n_images] = {
                "train_metrics": train_metrics,
                "val_metrics": val_metrics,
            }

        elif learning_type == "sl": #supervised learning
        
            # train model on labeled data
            dataloaders = (dataLoaderTrainLabeled, dataLoaderValUnlabeled)
            n_images = param["AL"]["INITIAL_SIZE"] + (round+1)*param["AL"]["LABELS_PER_ROUND"]
            train_metrics, val_metrics = al.run_expert(model=None, expert=expert, epochs=param["AL"]["EPOCH_TRAIN"], dataloaders=dataloaders, param=param, id=expert.labelerId, seed=seed, fold=fold, n_images=n_images, mod="ssl", prediction_type="target")
        
            metrics["Train"][n_images] = {
                "train_metrics": train_metrics,
                "val_metrics": val_metrics,
            }
    
    dataset_test_unlabeled = NIHExpertDatasetMemory(None, test_dataset.getAllFilenames(), np.array(test_dataset.getAllTargets()), expert.predict , [1]*len(test_dataset.getAllIndices()), test_dataset.getAllIndices(), param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
    dataLoaderVal = DataLoader(dataset=dataset_test_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)
    met_test = al.metrics_print_expert(model=None, expert=expert, data_loader=dataLoaderVal, id=expert.labelerId, seed=seed, fold=fold, n_images=param["AL"]["INITIAL_SIZE"] + param["AL"]["ROUNDS"]*param["AL"]["LABELS_PER_ROUND"], step="Test", param=param, mod="ssl", prediction_type="target")

    met = al.testExpert(expert, val_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Val")
    metrics["Val"]["End"] = met

    met = al.testExpert(expert, test_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Test")
    metrics["Test"]["End"] = met
    
    #metrics["Test"] = met
    print("AL finished")
    return met_test, metrics, all_data_filenames[indices_labeled]

In [69]:
def getExpertModelsSSL_AL(dataManager, experts, param, seed, fold, learning_mod="ssl", prediction_type="target"):

    if param["SETTING"] == "SSL_AL_SSL":
        learning_type = "ssl"
    elif param["SETTING"] == "SSL_AL":
        learning_type = "sl"

    assert learning_type != "", "Need to define how experts should be trained with new AL data (sl or ssl)"

    nih_dataloader = dataManager.getKFoldDataloader(seed)

    expert_train, expert_val, expert_test = nih_dataloader.get_dataset_for_folder(fold)
    image_container = nih_dataloader.get_ImageContainer()
    train_dataset = ds.NIHDataset(expert_train, preload=False, preprocess=False, param=param, image_container=image_container)
    val_dataset = ds.NIHDataset(expert_val, preload=False, preprocess=False, param=param, image_container=image_container)
    test_dataset = ds.NIHDataset(expert_test, preload=False, preprocess=False, param=param, image_container=image_container)

    sslDataset = dataManager.getSSLDataset(seed)
    usedFilenames = []
    for labelerId in param["LABELER_IDS"]:
        temp = usedFilenames + sslDataset.getLabeledFilenames(labelerId, fold)
    usedFilenames = temp
    
    
    # initialize data, Erhält alle Indizes der Daten
    all_indices = list(range(len(train_dataset.getAllIndices())))
    all_data_filenames = np.array(train_dataset.getAllFilenames())[all_indices]
    all_data_y = np.array(train_dataset.getAllTargets())[all_indices]

    unused_indices = [index for index in all_indices if all_data_filenames[index] not in usedFilenames]
    
    metrics = {}
    for labelerId, expert in experts.items():
        metrics[labelerId] = {}

        met = al.testExpert(expert, val_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Val")
        metrics[labelerId]["Val"] = {
            "Start": met,
        }

        met = al.testExpert(expert, test_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Test")
        metrics[labelerId]["Test"] = {
            "Start": met,
        }

        metrics[labelerId]["Train"] = {}

    set_seed(seed, fold, text="")

    gc.collect()

    indices_unlabeled = unused_indices
    indices_labeled = list(set(all_indices) - set(indices_unlabeled))

    dataset_train_unlabeled = NIHExpertDatasetMemory(None, all_data_filenames[indices_unlabeled], all_data_y[indices_unlabeled], experts[param["LABELER_IDS"][0]].predict , [0]*len(indices_unlabeled), indices_unlabeled, param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
    dataLoaderTrainUnlabeled = DataLoader(dataset=dataset_train_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)
    
    for round in range(param["AL"]["ROUNDS"]):

        print(f'\n \n Round {round} \n \n')

        #Try to get better Points
        if param["MOD"] == "disagreement":
            indices_qbq = al.getQbQPoints(experts, dataLoaderTrainUnlabeled, param["AL"]["LABELS_PER_ROUND"], mod="ssl", param=param)
        if param["MOD"] == "disagreement_diff":
            indices_qbq = al.getQbQPointsDifference(experts, dataLoaderTrainUnlabeled, param["AL"]["LABELS_PER_ROUND"], mod="ssl", param=param)
        
        #indices_labeled  = indices_labeled + list(indices_confidence) 
        indices_labeled  = indices_labeled + list(indices_qbq) 
        indices_unlabeled= list(set(all_indices) - set(indices_labeled))     
        
        # train model on labeled data
        for labelerId, expert in experts.items():

            #Val Dataset, needed for SSL and AL
            dataset_val_unlabeled = NIHExpertDatasetMemory(None, val_dataset.getAllFilenames(), np.array(val_dataset.getAllTargets()), expert.predict , [1]*len(val_dataset.getAllIndices()), val_dataset.getAllIndices(), param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
            dataLoaderValUnlabeled = DataLoader(dataset=dataset_val_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)

            #Create train dataset
            dataset_train_labeled = NIHExpertDatasetMemory(None, all_data_filenames[indices_labeled], all_data_y[indices_labeled], expert.predict , [1]*len(indices_labeled), indices_labeled, param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
            dataLoaderTrainLabeled = DataLoader(dataset=dataset_train_labeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)

            if learning_type == "ssl": #If the experts should be trained with ssl
                sslDataset = dataManager.getSSLDataset(seed)
                sslDataset.addNewLabels(all_data_filenames[list(indices_qbq)], fold, labelerId)
                emb_model, model = ssl.getExpertModelSSL(labelerId=labelerId, sslDataset=sslDataset, seed=seed, fold_idx=fold, n_labeled=None, embedded_model=None, param=param, neptune_param=param["NEPTUNE"], added_epochs=(round+1)*param["AL"]["SSL_EPOCHS"])
                experts[labelerId].setModel(expert_module.SSLModel(emb_model, model), mod="SSL")


                #TODO: Test experts and get metrics
                n_images = param["AL"]["INITIAL_SIZE"] + (round+1)*param["AL"]["LABELS_PER_ROUND"]

                train_metrics = al.metrics_print_expert(model=None, expert=expert, data_loader=dataLoaderTrainLabeled, id=expert.labelerId, seed=seed, fold=fold, n_images=n_images, step="Train", param=param, mod="ssl", prediction_type="target")
                val_metrics = al.metrics_print_expert(model=None, expert=expert, data_loader=dataLoaderValUnlabeled, id=expert.labelerId, seed=seed, fold=fold, n_images=n_images, step="Val", param=param, mod="ssl", prediction_type="target")

                metrics[labelerId]["Train"][n_images] = {
                    "train_metrics": train_metrics,
                    "val_metrics": val_metrics,
                }

                
            elif learning_type == "sl": # If the experts sould be trained with supervised learning

                dataloaders = (dataLoaderTrainLabeled, dataLoaderValUnlabeled)
                n_images = param["AL"]["INITIAL_SIZE"] + (round+1)*param["AL"]["LABELS_PER_ROUND"]
                train_metrics, val_metrics = al.run_expert(model=None, expert=expert, epochs=param["AL"]["EPOCH_TRAIN"], dataloaders=dataloaders, param=param, id=expert.labelerId, seed=seed, fold=fold, n_images=n_images, mod="ssl", prediction_type="target")

                metrics[labelerId]["Train"][n_images] = {
                    "train_metrics": train_metrics,
                    "val_metrics": val_metrics
                }
        
        dataset_train_unlabeled = NIHExpertDatasetMemory(None, all_data_filenames[indices_unlabeled], all_data_y[indices_unlabeled], expert.predict , [0]*len(indices_unlabeled), indices_unlabeled, param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
        dataLoaderTrainUnlabeled = DataLoader(dataset=dataset_train_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)
    
    dataset_test_unlabeled = NIHExpertDatasetMemory(None, test_dataset.getAllFilenames(), np.array(test_dataset.getAllTargets()), expert.predict , [1]*len(test_dataset.getAllIndices()), test_dataset.getAllIndices(), param=param, preload=param["AL"]["PRELOAD"], image_container=image_container)
    dataLoaderVal = DataLoader(dataset=dataset_test_unlabeled, batch_size=param["AL"]["BATCH_SIZE"], shuffle=True, num_workers=param["num_worker"], pin_memory=True)
    met_test = {}
    for labelerId, expert in experts.items():
        temp = al.metrics_print_expert(model=None, expert=expert, data_loader=dataLoaderVal, id=expert.labelerId, seed=seed, fold=fold, n_images=param["AL"]["INITIAL_SIZE"] + param["AL"]["ROUNDS"]*param["AL"]["LABELS_PER_ROUND"], step="Test", param=param, mod="ssl", prediction_type="target")
        met_test[expert.labelerId] = temp

        met = al.testExpert(expert, val_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Val")
        metrics[labelerId]["Val"]["End"] = met

        met = al.testExpert(expert, test_dataset, image_container, param, learning_mod, prediction_type, seed, fold, data_name="Test")
        metrics[labelerId]["Test"]["End"] = met
        
    return met_test, metrics, all_data_filenames[indices_labeled]

In [70]:
def getExpertsSSL_AL(dataManager, param, fold, seed):

    mod = "ssl"
    prediction_type = param["EXPERT_PREDICT"]

    sslDataset = dataManager.getSSLDataset(seed)

    sslDataset.createLabeledIndices(labelerIds=param["LABELER_IDS"], n_L=param["AL"]["INITIAL_SIZE"], k=round(param["AL"]["INITIAL_SIZE"]*param["OVERLAP"]/100), seed=seed, sample_equal=param["SAMPLE_EQUAL"])

    train_dataloader, val_dataloader, test_dataloader = sslDataset.get_data_loader_for_fold(fold)
    dataloaders = (train_dataloader, val_dataloader, test_dataloader)

    embedded_model = ssl.create_embedded_model(dataloaders, param, param["NEPTUNE"], fold=fold, seed=seed)

    indices = {}
    experts = {}
    for labelerId in param["LABELER_IDS"]:
        nih_expert = expert_module.Expert(dataset = dataManager.getBasicDataset(), labeler_id=labelerId, modus="ssl_al")
        emb_model, model = ssl.getExpertModelSSL(labelerId=labelerId, sslDataset=sslDataset, seed=seed, fold_idx=fold, n_labeled=None, embedded_model=embedded_model, param=param, neptune_param=param["NEPTUNE"])
        nih_expert.setModel(expert_module.SSLModel(emb_model, model), mod="SSL")
        experts[labelerId] = nih_expert
        indices[labelerId] = sslDataset.getLabeledFilenames(labelerId, fold)
    metrics = {}
    indices_labeled = {}
    if param["MOD"] == "confidence":
        for i, labelerId in enumerate(param["LABELER_IDS"]):
            met, metrics_return, labeled = getExpertModelSSL_AL(dataManager=dataManager, expert=experts[labelerId], labelerId=labelerId, param=param, seed=seed, fold=fold, learning_mod="ssl", prediction_type=param["EXPERT_PREDICT"])
            metrics[labelerId] = metrics_return
            indices_labeled[labelerId] = labeled
    elif param["MOD"] == "disagreement" or param["MOD"] == "disagreement_diff":
        met, metrics, indices_labeled = getExpertModelsSSL_AL(dataManager, experts, param, seed, fold, learning_mod="ssl", prediction_type=param["EXPERT_PREDICT"])
        
    return experts, metrics, {"starting labels": indices, "al labels": indices_labeled}

In [71]:
def getExpertsSSL(dataManager, param, fold, seed):

    sslDataset = dataManager.getSSLDataset(seed)

    mod = "ssl"
    prediction_type = param["EXPERT_PREDICT"]

    sslDataset.createLabeledIndices(labelerIds=param["LABELER_IDS"], n_L=param["LABELED"], k=round(param["LABELED"]*param["OVERLAP"]/100), seed=seed, sample_equal=param["SAMPLE_EQUAL"])

    train_dataloader, val_dataloader, test_dataloader = sslDataset.get_data_loader_for_fold(fold)
    dataloaders = (train_dataloader, val_dataloader, test_dataloader)

    ssl.create_embedded_model(dataloaders, param, param["NEPTUNE"], fold=fold, seed=seed)

    torch.cuda.empty_cache()
    gc.collect()

    indices = {}

    experts = {}
    for labelerId in param["LABELER_IDS"]:
        nih_expert = expert_module.Expert(dataset = dataManager.getBasicDataset(), labeler_id=labelerId, modus="ssl")
        emb_model, model = ssl.getExpertModelSSL(labelerId=labelerId, sslDataset=sslDataset, seed=seed, fold_idx=fold, n_labeled=None, embedded_model=None, param=param, neptune_param=param["NEPTUNE"])
        nih_expert.setModel(expert_module.SSLModel(emb_model, model), mod="SSL")
        experts[labelerId] = nih_expert
        indices[labelerId] = sslDataset.getLabeledFilenames(labelerId, fold)

    nih_dataloader = dataManager.getKFoldDataloader(seed)
    expert_train, expert_val, expert_test = nih_dataloader.get_dataset_for_folder(fold)
    image_container = nih_dataloader.get_ImageContainer()

    val_dataset = ds.NIHDataset(expert_val, preload=False, preprocess=False, param=param, image_container=image_container)
    test_dataset = ds.NIHDataset(expert_test, preload=False, preprocess=False, param=param, image_container=image_container)

    metrics = {}
    for labelerId, expert in experts.items():
        metrics[labelerId] = {}

        met = al.testExpert(expert, val_dataset, image_container, param, mod, prediction_type, seed, fold, data_name="Val")
        metrics[labelerId]["Val"] = {
            "End": met,
        }

        met = al.testExpert(expert, test_dataset, image_container, param, mod, prediction_type, seed, fold, data_name="Test")
        metrics[labelerId]["Test"] = {
            "End": met
        }
        
    return experts, metrics, {"starting labels": indices}

In [72]:
def setupEmbeddedModel(dataManager, param, fold, seed):
    sslDataset = dataManager.getSSLDataset(seed)

    mod = "ssl"
    prediction_type = param["EXPERT_PREDICT"]

    train_dataloader, val_dataloader, test_dataloader = sslDataset.get_data_loader_for_fold(fold)
    dataloaders = (train_dataloader, val_dataloader, test_dataloader)

    ssl.create_embedded_model(dataloaders, param, param["NEPTUNE"], fold=fold, seed=seed)

    torch.cuda.empty_cache()
    gc.collect()

In [73]:
def getExpertsAL(dataManager, param, fold_idx, seed):
    nih_dataloader = dataManager.getKFoldDataloader(seed)
    expert_train, expert_val, expert_test = nih_dataloader.get_dataset_for_folder(fold_idx)
    image_container = nih_dataloader.get_ImageContainer()
    expert_train_dataset = ds.NIHDataset(expert_train, preload=False, preprocess=False, param=param, image_container=image_container)
    expert_val_dataset = ds.NIHDataset(expert_val, preload=False, preprocess=False, param=param, image_container=image_container)
    expert_test_dataset = ds.NIHDataset(expert_test, preload=False, preprocess=False, param=param, image_container=image_container)

    setupEmbeddedModel(dataManager, param, fold_idx, seed)
    #Get init labeled indices with k same images and n-k different images
    #k=None means random indieces
    k = param["OVERLAP"]
    all_indices = list(range(len(expert_train_dataset.getAllIndices())))
    #If no k is set than it selects one randomly
    k = round(param["AL"]["INITIAL_SIZE"]*k/100)
    if param["NEPTUNE"]["NEPTUNE"]:
        run["param/overlap_k"] = k
    indices = al.sampleIndices(n = param["AL"]["INITIAL_SIZE"], k = k, all_indices = all_indices, experten = list(param["LABELER_IDS"]), seed = seed, fold=fold_idx)

    if param["NEPTUNE"]["NEPTUNE"]:
        run[f"Seed_{seed}/Fold_{fold_idx}/Experts/Indices"] = indices

    print("Random indices:")
    print(indices)

    labeld_filenames = {}

    indeces_al = {}

    experts = {}
    metrics = {}
    for i, labelerId in enumerate(list(param["LABELER_IDS"])):
        nih_expert = expert_module.Expert(dataset = dataManager.getBasicDataset(), labeler_id=labelerId, modus="al")
        experts[labelerId] = nih_expert
        print("DELETE ME")
        print("Drawn indices")
        print(indices)
        print(f"Len of all filenames: {len(expert_train_dataset.getAllFilenames())}")
        print("All indices")
        print(expert_train_dataset.getAllIndices())
        labeld_filenames[labelerId] = np.array(expert_train_dataset.getAllFilenames())[indices[labelerId]]
        if param["MOD"] == "confidence":
            expert_model, met_test, metric, indices_labeled = al.getExpertModel(indices[labelerId], expert_train_dataset, expert_val_dataset, expert_test_dataset, nih_expert, param, seed, fold_idx, image_container=image_container, learning_mod="al", prediction_type=param["EXPERT_PREDICT"])
            nih_expert.setModel(expert_model, mod="AL")
            metrics[labelerId] = metric
            indeces_al[labelerId] = indices_labeled
    if param["MOD"] == "disagreement" or param["MOD"]=="disagreement_diff":
        expert_models, met, metrics, indeces_al = al.getExpertModels(indices, experts, expert_train_dataset, expert_val_dataset, expert_test_dataset, param, seed, fold_idx, mod=param["MOD"], image_container=image_container, learning_mod="al", prediction_type=param["EXPERT_PREDICT"])
        for labelerId, expert in experts.items():
            expert.setModel(expert_models[labelerId], mod="AL")

    return experts, metrics, {"starting labels": labeld_filenames, "al labels": indeces_al}

In [74]:
def getExpertsNormal(dataManager, param, fold_idx, seed):
    nih_dataloader = dataManager.getKFoldDataloader(seed)
    expert_train, expert_val, expert_test = nih_dataloader.get_dataset_for_folder(fold_idx)
    image_container = nih_dataloader.get_ImageContainer()
    expert_train_dataset = ds.NIHDataset(expert_train, preload=False, preprocess=False, param=param, image_container=image_container)
    expert_val_dataset = ds.NIHDataset(expert_val, preload=False, preprocess=False, param=param, image_container=image_container)
    expert_test_dataset = ds.NIHDataset(expert_test, preload=False, preprocess=False, param=param, image_container=image_container)

    setupEmbeddedModel(dataManager, param, fold_idx, seed)
    
    #Get init labeled indices with k same images and n-k different images
    #k=None means random indieces
    k = param["OVERLAP"]
    all_indices = list(range(len(expert_train_dataset.getAllIndices())))
    #If no k is set than it selects one randomly
    k = round(param["LABELED"]*k/100)
    if param["NEPTUNE"]["NEPTUNE"]:
        run["param/overlap_k"] = k
    indices = al.sampleIndices(n = param["LABELED"], k = k, all_indices = all_indices, experten = list(param["LABELER_IDS"]), seed = seed, fold=fold_idx)

    if param["NEPTUNE"]["NEPTUNE"]:
        run[f"Seed_{seed}/Fold_{fold_idx}/Experts/Indices"] = indices

    print("Random indices:")
    print(indices)

    labeled_filenames = {}

    experts = {}
    #Create the experts
    metrics = {}
    for i, labelerId in enumerate(list(param["LABELER_IDS"])):
        nih_expert = expert_module.Expert(dataset = dataManager.getBasicDataset(), labeler_id=labelerId, modus="normal")
        experts[labelerId] = nih_expert
        labeld_filenames[labelerId] = np.array(expert_train_dataset.getAllFilenames())[indices[labelerId]]

        model, met, metric = al.getExpertModelNormal(indices[labelerId], expert_train_dataset, expert_val_dataset, expert_test_dataset, nih_expert, param, seed, fold_idx, image_container=image_container, learning_mod="al", prediction_type=param["EXPERT_PREDICT"])
        nih_expert.setModel(model, mod="AL")
        metrics[labelerId] = metric

    return experts, metrics, {"starting labels": labeled_filenames}

In [75]:
def getExpertsPerfect(dataManager, param, fold, seed):

    experts = {}
    for i, labelerId in enumerate(list(param["LABELER_IDS"])):
        nih_expert = expert_module.Expert(dataset = dataManager.getBasicDataset(), labeler_id=labelerId, modus="perfect")
        experts[labelerId] = nih_expert


    sslDataset = dataManager.getSSLDataset(seed)

    mod = "perfect"
    prediction_type = param["EXPERT_PREDICT"]

    torch.cuda.empty_cache()
    gc.collect()

    nih_dataloader = dataManager.getKFoldDataloader(seed)
    expert_train, expert_val, expert_test = nih_dataloader.get_dataset_for_folder(fold)
    image_container = nih_dataloader.get_ImageContainer()

    val_dataset = ds.NIHDataset(expert_val, preload=False, preprocess=False, param=param, image_container=image_container)
    test_dataset = ds.NIHDataset(expert_test, preload=False, preprocess=False, param=param, image_container=image_container)

    metrics = {}
    for labelerId, expert in experts.items():
        metrics[labelerId] = {}

        met = al.testExpert(expert, val_dataset, image_container, param, mod, prediction_type, seed, fold, data_name="Val")
        metrics[labelerId]["Val"] = {
            "End": met,
        }

        met = al.testExpert(expert, test_dataset, image_container, param, mod, prediction_type, seed, fold, data_name="Test")
        metrics[labelerId]["Test"] = {
            "End": met
        }

    return experts, metrics

In [76]:
def getExperts(dataManager, param, seed, fold):
      
    #Creates expert models for the choosen method
    if param["SETTING"] == "PERFECT":
        experts, metrics = getExpertsPerfect(dataManager, param, fold, seed)
    if param["SETTING"] == "AL":
        experts, metrics, labeled_filenames = getExpertsAL(dataManager, param, fold, seed)
    elif param["SETTING"] == "SSL":
        experts, metrics, labeled_filenames = getExpertsSSL(dataManager, param, fold, seed)
    elif param["SETTING"] == "SSL_AL" or param["SETTING"] == "SSL_AL_SSL":
        experts, metrics, labeled_filenames = getExpertsSSL_AL(dataManager, param, fold, seed)
    elif param["SETTING"] == "NORMAL":
        experts, metrics, labeled_filenames = getExpertsNormal(dataManager, param, fold, seed)

    return experts, metrics, labeled_filenames

In [77]:
def L2D_Verma(train_loader, val_loader, test_loader, full_dataloader, expert_fns, param, seed, fold_idx, experts):
    num_experts = len(expert_fns)
            
    model = model = vres.ResNet50_defer(int(param["n_classes"]) + num_experts)
    if torch.cuda.device_count() > 1:
        print("Use ", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    metrics_train_all, metrics_val_all, metrics_test, metrics_full, metrics_pretrain_all = verm.train(model, train_loader, val_loader, test_loader, expert_fns, param, seed=seed, experts=experts, 
                                                                                fold=fold_idx, full_dataloader=full_dataloader, param=param)

    return metrics_train_all, metrics_val_all, metrics_test, metrics_full, metrics_pretrain_all

In [78]:
def one_run(dataManager, run_param, all_metrics, print_text, run_metrics, count, current_index=None):
    """
    Computes all seed-fold combinations for one parameter combination and saves the metrics into a file
    
    Param:
        dataManager: DataManager for all data
        run_param: dict of all relevant parameters for this run
        all_metrics: list which contains all already computed results
        print_text: output text to print the current paramater combination
        run_metrics: core parameters for this run (which vary over different runs)
        count: integer to identify the save file (and number of runs)
        current_index: index of the current run in all_metrics, if it exists
    """

    #Get device for cuda training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #To ensure to only print the run text only one time
    printed = False

    #Metrics for this run
    expert_metrics = {}
    verma_metrics = {}
    hemmer_metrics = {}

    #Checks if there is data for this run in the save files
    if current_index is not None:
        #Load the current metrics
        print(f"Current index: {current_index}")
        current_metric = all_metrics[current_index]

        #Save the already computed metrics in the working directories
        expert_metrics = current_metric["expert metrics"]
        verma_metrics = current_metric["verma"]
        hemmer_metrics = current_metric["hemmer"]
    #If not, create new element in list of all metrics
    else:
        all_metrics.append(run_metrics)
        

    #Iterate over all seeds
    for seed in run_param["SEEDS"]:

        #If this seed is not already in the save file
        if seed not in expert_metrics.keys():
            print(f"New seed: {seed}")
            expert_metrics[seed] = {}
            verma_metrics[seed] = {}
            hemmer_metrics[seed] = {}

        #Iterate over the folds
        #for fold_idx in range(run_param["K"]):
        for fold_idx in range(2):

            #Check if the seed-fold combination is already in the save files
            if fold_idx in expert_metrics[seed].keys():
                continue
            else:
                print(f"Keys: {expert_metrics[seed].keys()}")
                print(f"New fold: {fold_idx}")

            #Print run text if at least one computation is made for this parameter combination (run)
            if not printed:
                print(print_text)
                printed = True

            
            if run_param["cluster"]: #Keep the embedded model in cluster training
                if os.path.isdir(f'{run_param["Parent_PATH"]}/SSL_Working/SSL'):
                    cleanTrainDir(f'{run_param["Parent_PATH"]}/SSL_Working/SSL')
            else: #delete everything if space is limited
                if os.path.isdir(f'{run_param["Parent_PATH"]}/SSL_Working'):
                    cleanTrainDir(f'{run_param["Parent_PATH"]}/SSL_Working')

            if seed != "":
                set_seed(seed, fold_idx, text="")

            print("/n")
            print(f"Seed: {seed} - Fold: {fold_idx} \n")

            #if os.path.isdir(f'{run_param["Parent_PATH"]}/SSL_Working/NIH/EmbeddingCM_bin'):
            #    cleanTrainDir(f'{run_param["Parent_PATH"]}SSL_Working/NIH/EmbeddingCM_bin')

            neptune = {
                "SEED": seed,
                "FOLD": fold_idx,
            }

            torch.cuda.empty_cache()
            gc.collect()

            experts, expert_metric, labeled_filenames = getExperts(dataManager, run_param, seed, fold_idx)
            expert_metrics[seed][fold_idx] = expert_metric

            torch.cuda.empty_cache()
            gc.collect()

            #print(f"Got {len(experts)} experts")

            nih_dataloader = dataManager.getKFoldDataloader(seed=seed)

            train_loader, val_loader, test_loader = nih_dataloader.get_data_loader_for_fold(fold_idx)
            full_dataloader = nih_dataloader.getFullDataloader()

            expert_fns = []
            print(run_param["SETTING"])
            for labelerId, expert in experts.items():
                if run_param["SETTING"] == "AL":
                    expert.init_model_predictions(full_dataloader, mod="AL")
                    expert_fns.append(expert.predict_model_predefined_al)
                elif run_param["SETTING"] == "SSL":
                    expert.init_model_predictions(full_dataloader, mod="SSL")
                    expert_fns.append(expert.predict_model_predefined_ssl)
                elif (run_param["SETTING"] == "SSL_AL" or run_param["SETTING"] == "SSL_AL_SSL"):
                    expert.init_model_predictions(full_dataloader, mod="SSL")
                    expert_fns.append(expert.predict_model_predefined_ssl)
                elif run_param["SETTING"] == "NORMAL":
                    expert.init_model_predictions(full_dataloader, mod="AL")
                    expert_fns.append(expert.predict_model_predefined_al)
                elif run_param["SETTING"] == "PERFECT":
                    expert_fns.append(expert.predict)

            print("DELETE ME")
            return experts, dataManager, labeled_filenames

            metrics_train_all, metrics_val_all, metrics_test_all, metrics_full_all, metrics_pretrain_all = L2D_Verma(train_loader, val_loader, test_loader, full_dataloader, expert_fns, run_param, seed, fold_idx, experts=experts)

            verma_metrics[seed][fold_idx] = {
                "train": metrics_train_all,
                "val": metrics_val_all,
                "test": metrics_test_all,
                "full": metrics_full_all,
                "pretrain": metrics_pretrain_all,
            }
            
            system_accuracy, classifier_coverage, all_train_metrics, all_val_metrics, all_test_metrics, all_full_metrics = hm.L2D_Hemmer(train_loader, val_loader, test_loader, full_dataloader, expert_fns, run_param, seed, fold_idx, experts)

            hemmer_metrics[seed][fold_idx] = {
                "train": all_train_metrics,
                "val": all_val_metrics,
                "test": all_test_metrics,
                "full": all_full_metrics,
            }

            run_metrics["expert metrics"] = expert_metrics
            run_metrics["verma"] = verma_metrics
            run_metrics["hemmer"] = hemmer_metrics

            #Write only into new file if a new run was computed
            temp_count = count
            if current_index is not None:
                all_metrics[current_index] = run_metrics
                temp_count = count - 1
            else:
                all_metrics[-1] = run_metrics
            with open(f'{run_param["Parent_PATH"]}/Metrics_Folder/Metrics_{temp_count}.pickle', 'wb') as handle:
                pickle.dump(all_metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return expert_metrics, verma_metrics, hemmer_metrics

In [79]:
def run_experiment(param):
    run_param = copy.deepcopy(param)

    runs = None

    expert_metrics_all = []

    count = 0

    list_of_files = glob.glob(f'{param["Parent_PATH"]}/Metrics_Folder/*') # * means all if need specific format then *.csv
    
    if len(list_of_files) >= 1:
        latest_file = max(list_of_files, key=os.path.getctime)
      
        print(f"Open metrics file: {latest_file}")

        with open(latest_file, 'rb') as handle:
            expert_metrics_all = pickle.load(handle)

        runs = [{i:run[i] for i in run if i not in ["expert metrics", "verma", "hemmer"]} for run in expert_metrics_all]

        if "pickle" in latest_file:

            count = int(latest_file.split("/")[-1][8:-7]) + 1

    #Every pair of labeler ids
    for labeler_ids in param["LABELER_IDS"]:
        run_param["LABELER_IDS"] = labeler_ids
        run_param["labeler_ids"] = convert_ids_to_string(labeler_ids)
        

        dataManager = ds.DataManager(path=param["PATH"], target=param["TARGET"], param=run_param, seeds=param["SEEDS"])
        dataManager.createData()

        for init_size in param["AL"]["INITIAL_SIZE"]:
            run_param["AL"]["INITIAL_SIZE"] = init_size

            for labels_per_round in param["AL"]["LABELS_PER_ROUND"]:
                run_param["AL"]["LABELS_PER_ROUND"] = labels_per_round

                for rounds in param["AL"]["ROUNDS"]:
                    run_param["AL"]["ROUNDS"] = rounds

                    labeled = init_size + rounds * labels_per_round

                    run_param["LABELED"] = labeled

                    if (labeled >= 128): #Prevents from large amount of data
                        continue

                    for cost in param["AL"]["COST"]:
                        run_param["AL"]["COST"] = cost
                        run_param["AL"]["cost"] = convert_cost_to_string(cost)

                        for overlap in param["OVERLAP"]:
                            run_param["OVERLAP"] = overlap

                            for setting in param["SETTING"]:
                                run_param["SETTING"] = setting
                        
                                for mod in param["MOD"]:
                                    run_param["MOD"] = mod

                                    if ((setting == "AL"  or setting=="SSL_AL" or setting=="SSL_AL_SSL") and (mod not in ["confidence", "disagreement", "disagreement_diff"])):
                                        continue

                                    if (setting == "SSL" and mod != "ssl"):
                                        continue

                                    if (setting == "NORMAL" and mod != "normal"):
                                        continue

                                    for expert_predict in param["EXPERT_PREDICT"]:
                                        run_param["EXPERT_PREDICT"] = expert_predict

                                        if ((setting == "SSL" or setting == "SSL_AL" or setting == "SSL_AL_SSL") and (expert_predict == "right")):
                                            continue

                                        if (expert_predict == "target") and (cost != param["AL"]["COST"][0]):
                                            continue
                                        if (expert_predict == "target"):
                                            run_param["AL"]["cost"] = convert_cost_to_string((0, 0))

                                        for sample_equal in param["SAMPLE_EQUAL"]:
                                            run_param["SAMPLE_EQUAL"] = sample_equal

                                            for epochs_pretrain in param["epochs_pretrain"]:
                                                run_param["epochs_pretrain"] = epochs_pretrain

                                                metrics_save = {}
                                                metrics_save["labeler_ids"] = labeler_ids
                                                metrics_save["init_size"] = init_size
                                                metrics_save["labels_per_round"] = labels_per_round
                                                metrics_save["rounds"] = rounds
                                                metrics_save["labeled"] = labeled
                                                metrics_save["cost"] = cost
                                                metrics_save["overlap"] = overlap
                                                metrics_save["setting"] = setting
                                                metrics_save["mod"] = mod
                                                metrics_save["expert_predict"] = expert_predict
                                                metrics_save["sample_equal"] = sample_equal
                                                metrics_save["epochs_pretrain"] = epochs_pretrain

                                                
                                                current_index = None
                                                
                                                #Compute the current index
                                                if runs is not None:
                                                    #If this parameter compination is in the already done runs
                                                    if metrics_save in runs:
                                                        #Get index of this combination
                                                        current_index = runs.index(metrics_save)
                                                        print(f"Current index: {current_index}")
                            
                                                NEPTUNE = param["NEPTUNE"]["NEPTUNE"]
                                                if param["NEPTUNE"]["NEPTUNE"]:
                                                    global run
                                                    run = neptune.init_run(
                                                        project=config_neptune["project"],
                                                        api_token=config_neptune["api_token"],
                                                        #custom_run_id="AL_" + 
                                                    )
                                                    run["param"] = run_param
                                                    run_param["NEPTUNE"]["RUN"] = run

                                                print_text = f"""\n \n \n #############################################################
                                                NEW RUN

                                                Initial size: {init_size}
                                                Batch size AL: {labels_per_round}
                                                Max rounds: {rounds}
                                                Labeled images: {labeled}
                                                Cost: {cost}
                                                Setting: {setting}
                                                Mod: {mod}
                                                Overlap: {overlap}
                                                Prediction Type: {expert_predict}
                                                Sample equal: {sample_equal}
                                                Epochs pretrain: {epochs_pretrain}
                                                """

                                                start_time = time.time()
                                                #dataManager, run_param, all_metrics, print_text, run_metrics, count, current_index=None
                                                #expert_metrics, verma_metrics, hemmer_metrics = one_run(dataManager, run_param, expert_metrics_all.copy(), print_text, metrics_save,
                                                #                                                       count, current_index)

                                                print("DELETE ME")
                                                return one_run(dataManager, run_param, expert_metrics_all.copy(), print_text, metrics_save, count, current_index)
                                                
                                                print("--- %s seconds ---" % (time.time() - start_time))

                                                metrics_save["expert metrics"] = expert_metrics
                                                metrics_save["verma"] = verma_metrics
                                                metrics_save["hemmer"] = hemmer_metrics
                                                ensure_count = 0 #Helps to save into the correct file if metrics are added to a run
                                                if current_index is not None:
                                                    expert_metrics_all[current_index] = metrics_save
                                                    ensure_count = 1
                                                else:
                                                    expert_metrics_all.append(metrics_save)
                                                with open(f'{param["Parent_PATH"]}/Metrics_Folder/Metrics_{count - ensure_count}.pickle', 'wb') as handle:
                                                    pickle.dump(expert_metrics_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
                                                if current_index is None:
                                                    count += 1
                                                if param["NEPTUNE"]["NEPTUNE"]:
                                                    run["metrics"] = metrics_save

                                                    run.stop()
                                                return

    return expert_metrics_all

In [80]:
def convert_cost_to_string(tp):
    return "(" + str(tp[0]) + ", " + str(tp[1]) + ")"

def convert_ids_to_string(ids):
    return f"{ids[0]}, {ids[1]}"

def convert_list_to_string(li):
    result = "["
    for el in li[:-2]:
        result = result + str(el)
    result = result + "]"
    return 

In [81]:
path = os.getcwd()
path

'/home/joli/Masterarbeit'

In [82]:
def main(args):

    path = args[0]

    num_worker = 4
    if len(args) >= 2:
        num_worker = int(args[1])

    if "liebschner" not in path and "joli" not in path:
        return

    param = {
        "PATH": f"{path}/Datasets/NIH/",
        "Parent_PATH": path,
        "TARGET": "Airspace_Opacity",
        #"LABELER_IDS": [[4323195249, 4295232296]],
        "LABELER_IDS": [[4295349121, 4295342357]],
        "K": 10, #Number of folds
        #"SEEDS": [1, 2, 3, 4, 42], #Seeds for the experiments
        "SEEDS": [1], #Seeds for the experiments
        "GT": True, # Determines if the classifier gets all data with GT Label or only the labeld data
        #"MOD": ["confidence", "disagreement", "disagreement_diff", "ssl", "normal"], #Determines the experiment modus
        "MOD": ["confidence"],

        "OVERLAP": [0, 100],
        "SAMPLE_EQUAL": [False, True],

        #"SETTING": ["AL", "SSL", "SSL_AL", "NORMAL", "SSL_AL_SSL"],
        "SETTING": ["SSL_AL"],

        "NUM_EXPERTS": 2,
        "NUM_CLASSES": 2,

        "EXPERT_PREDICT": ["right", "target"],

        "AL": { #Parameter for Active Learning
            "INITIAL_SIZE": [16, 32], #
            "EPOCH_TRAIN": 40, #
            "n_dataset": 2, #Number Classes
            "BATCH_SIZE": 4,
            "BATCH_SIZE_VAL": 32,
            "ROUNDS": [2, 4, 8],
            "LABELS_PER_ROUND": [4, 8, 16],
            "EPOCHS_DEFER": 10,
            "COST": [(0, 0), (5, 0)], #Cost for Cost sensitiv learning
            #"TRAIN REJECTOR": False,
            "PRELOAD": True,
            "PREPROCESS": True,
            "SSL_EPOCHS": 3
        
        },
        "SSL": {
            "PREBUILD": False,
            #"TRAIN_BATCH_SIZE": 128,
            "TRAIN_BATCH_SIZE": 254,
            "TEST_BATCH_SIZE": 254,
            "N_EPOCHS": 5, #number of training epoches
            "BATCHSIZE": 16, #train batch size of labeled samples
            #"N_IMGS_PER_EPOCH": 32768, #number of training images for each epoch
            "N_IMGS_PER_EPOCH": 4381*1, #number of training images for each epoch
        },
        "L2D": { # Parameter for Learning to defer
            "TRAIN_BATCH_SIZE": 128,
            "TEST_BATCH_SIZE": 128,
            "PRELOAD": True,
            "PREBUILD": True,
            "EPOCHS": 50,
            "VERMA": {},
            "HEMMER": {
                "EPOCHS": 50,
                "LR": 5e-3,
                "USE_LR_SCHEDULER": False,
                "DROPOUT": 0.00,
                "NUM_HIDDEN_UNITS": 30,
            },
        
        },
        "NEPTUNE": {
            "NEPTUNE": False,
        },
        "EMBEDDED": {
            "ARGS": {
                'dataset': "nih",
                'model': "resnet50",
                'num_classes': 2,
                'batch': 128,
                'lr': 0.001,
            },
            "EPOCHS": 30,
        },
    
    
        "epochs_pretrain": [0],
        "batch_size": 64,
        "alpha": 1.0, #scaling parameter for the loss function, default=1.0
        "epochs": 50,
        "patience": 35, #number of patience steps for early stopping the training
        "expert_type": "MLPMixer", #specify the expert type. For the type of experts available, see-> models -> experts. defualt=predict
        "n_classes": 2, #K for K class classification
        "k": 0, #
        "n_experts": 2, #
        "lr": 0.001, #learning rate
        "weight_decay": 5e-4, #
        "warmup_epochs": 5, #
        #"loss_type": "softmax", #surrogate loss type for learning to defer
        "loss_type": "ova",
        "ckp_dir": f"{path}/Models", #directory name to save the checkpoints
        "experiment_name": "multiple_experts", #specify the experiment name. Checkpoints will be saved with this name

        #Params for cluster training
        "num_worker": num_worker,
        "cluster": False
    }

    return run_experiment(param)

    expert_metrics_all = run_experiment(param)

In [None]:
experts, dataManager, labeled_filenames = main(["/home/joli", 0])

In [281]:
basicDataset = dataManager.getBasicDataset()
basicData = basicDataset.getData().copy()
basicData.head()
labelerIds = basicData.columns[3:]

Unnamed: 0,Patient ID,Image ID,GT,4295367682,4295232296,4323195249,4295194124,4325222456,4295354140,4295206903,...,4295246212,4295354117,4326615889,4322936986,4295369079,4295344101,4325221191,4295349028,4295393403,4295354708
0,13,00000013_008.png,1,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,13,00000013_026.png,0,-1.0,-1.0,0.0,0.0,0.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,32,00000032_002.png,1,-1.0,-1.0,1.0,0.0,-1.0,0.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,32,00000032_009.png,1,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,32,00000032_011.png,1,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4376,30709,00030709_002.png,0,0.0,-1.0,0.0,0.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4377,30719,00030719_000.png,0,-1.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4378,30752,00030752_000.png,0,-1.0,-1.0,0.0,0.0,0.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4379,30759,00030759_000.png,0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [291]:
def equal_labeled_images(data, experts):
    """
    Returns dataframe with only images which are labeled from every given expert
    """
    for expertId in experts:
        data = data[data[expertId] != -1].copy()
    return data[["Patient ID", "Image ID", "GT"] + experts]

def number_labeled_images(data, experts):
    """
    Returns number of images which every given expert has labeled
    """
    return len(equal_labeled_images(data, experts))

def get_pair_labeled(data, Ids):
    """
    Returns matrix which contains the number of labeled images for every pair of experts
    """
    images_dict = {}
    for i in Ids:
        labeler_dict = {}
        for j in Ids:
            labeler_dict[j] = number_labeled_images(basicData, [i, j])
        images_dict[i] = labeler_dict
    return pd.DataFrame(images_dict)

pair_labeled = get_pair_labeled(basicData, labelerIds)
pair_labeled

Unnamed: 0,4295367682,4295232296,4323195249,4295194124,4325222456,4295354140,4295206903,4326829894,4295376896,4295349121,...,4295246212,4295354117,4326615889,4322936986,4295369079,4295344101,4325221191,4295349028,4295393403,4295354708
4295367682,536,9,536,319,0,0,74,0,0,0,...,0,0,0,24,0,0,90,0,0,0
4295232296,9,853,852,162,0,58,256,253,0,0,...,0,0,0,0,0,0,98,0,12,0
4323195249,536,852,2320,936,162,212,593,295,100,0,...,0,28,32,190,0,64,255,20,12,0
4295194124,319,162,936,938,64,122,149,8,101,0,...,0,0,0,0,0,0,11,0,0,0
4325222456,0,0,162,64,162,6,69,0,0,0,...,0,0,0,0,0,0,23,0,0,0
4295354140,0,58,212,122,6,212,0,0,0,0,...,0,0,0,0,0,0,23,0,0,0
4295206903,74,256,593,149,69,0,594,31,0,0,...,0,0,0,0,0,0,10,0,0,0
4326829894,0,253,295,8,0,0,31,296,0,0,...,0,0,0,0,0,0,0,0,0,0
4295376896,0,0,100,101,0,0,0,0,101,0,...,0,0,0,0,0,0,0,0,0,0
4295349121,0,0,0,0,0,0,0,0,0,1674,...,425,758,265,0,0,0,0,39,0,23


### New functions for saving the labels from experts

In [None]:
def save_expert_labels(experts, labeled_images):
    pass

### New functions for training without gt

In [None]:
def create_artificial_gt(dataManager, train_dataloader, experts, fold, seed, method="perfect"):
    if method == "perfect":
        return train_dataloader

In [None]:
def create_label_df(dataset, experts):
    """
    Creates a df with filename, gt, [artificial expert predictions]
    """
    gt = pd.DataFrame({"filename": dataset.getAllFilenames(), "gt": dataset.getAllTargets()})
    result = gt.copy()
    for expert_id, expert in experts.items():
        if "ssl" in expert.modus:
            expert_df = pd.DataFrame({"filename": expert.prebuild_filenames_ssl, f"prediction_{expert_id}": expert.prebuild_predictions_ssl})
        else:
            expert_df = pd.DataFrame({"filename": expert.prebuild_filenames_al, f"prediction_{expert_id}": expert.prebuild_predictions_al})
        result = pd.merge(result, expert_df)
    return result

In [270]:
def create_true_expert_labels(dataset, experts):
    """
    Creates a df with filename, gt, [true expert predictions]
    """
    gt = pd.DataFrame({"filename": dataset.getAllFilenames(), "gt": dataset.getAllTargets()})
    result = gt.copy()
    for expert_id, expert in experts.items():
        expert_df = expert.predictions.reset_index(names="filename")
        result = pd.merge(result, expert_df)
    return result

In [272]:
def filter_labeled(df, labeled):
    return df[df["filename"].isin(labeled)]

In [None]:
fullDataset = dataManager.getKFoldDataloader(1).getFullDataloader().dataset
fullDataset

Create df with filename, gt, and artificial expert predictions

In [None]:
label_df = create_label_df(fullDataset, experts)
label_df.head(7)

print(f'Accuracy expert 1: {len(label_df[label_df["gt"] == label_df["prediction_4295349121"]])/len(label_df)*100}%')
print(f'Accuracy expert 2: {len(label_df[label_df["gt"] == label_df["prediction_4295342357"]])/len(label_df)*100}%')

The corresponding real expert predictions

In [274]:
true_experts = create_true_expert_labels(fullDataset, experts)
true_experts

Unnamed: 0,filename,gt,4295349121,4295342357
0,00000119_001.png,0,0.0,0.0
1,00000134_000.png,1,1.0,0.0
2,00000135_001.png,1,0.0,0.0
3,00000156_000.png,1,1.0,1.0
4,00000156_001.png,1,1.0,1.0
...,...,...,...,...
1630,00019643_019.png,1,1.0,0.0
1631,00019643_020.png,0,0.0,0.0
1632,00019643_021.png,0,0.0,0.0
1633,00019643_022.png,0,0.0,0.0


Processing the list of labeled images

In [273]:
result = np.empty(0, dtype="str")
for key, item in labeled_list["ssl_al"]["al labels"].items():
    result = np.append(result, item)
result

array(['00012233_001.png', '00001792_003.png', '00015698_001.png',
       '00001722_004.png', '00013443_001.png', '00001470_000.png',
       '00018546_002.png', '00018557_011.png', '00019260_007.png',
       '00013601_003.png', '00012184_002.png', '00012387_001.png',
       '00018829_005.png', '00016052_006.png', '00018019_005.png',
       '00014558_011.png', '00014715_011.png', '00001449_002.png',
       '00016887_000.png', '00019643_007.png', '00019240_005.png',
       '00012010_005.png', '00013073_006.png', '00012010_035.png',
       '00001722_004.png', '00013844_001.png', '00019576_023.png',
       '00019643_004.png', '00018546_006.png', '00012010_026.png',
       '00012219_007.png', '00013916_002.png', '00012543_010.png',
       '00016561_003.png', '00018250_000.png', '00013613_013.png',
       '00018335_012.png', '00019576_060.png', '00013601_022.png',
       '00013613_016.png', '00018329_007.png', '00001249_004.png'],
      dtype='<U16')

Filter expert predictions for only labeled images

In [276]:
filter_labeled(true_experts, result).head(5)

Unnamed: 0,filename,gt,4295349121,4295342357
21,00001249_004.png,1,1.0,0.0
34,00001449_002.png,0,0.0,0.0
35,00001470_000.png,0,0.0,0.0
64,00001722_004.png,0,0.0,0.0
70,00001792_003.png,0,0.0,0.0


<Dataset.Dataset.NIHDataset at 0x7f4c2b5a0400>

In [52]:
expert_train, expert_val, expert_test = dataManager.getKFoldDataloader(1).get_dataset_for_folder(0)

{4295349121: [440, 112, 28, 357, 759, 108, 463, 256, 675, 703, 101, 33, 663, 16, 198, 183], 4295342357: [701, 770, 503, 32, 776, 577, 733, 248, 384, 186, 729, 249, 591, 268, 174, 93]}

In [86]:
labeled_list["ssl_al"] = labeled_filenames

In [265]:
label_df = create_label_df(fullDataset, experts)
label_df.head(7)

print(f'Accuracy expert 1: {len(label_df[label_df["gt"] == label_df["prediction_4295349121"]])/len(label_df)*100}%')
print(f'Accuracy expert 2: {len(label_df[label_df["gt"] == label_df["prediction_4295342357"]])/len(label_df)*100}%')

Accuracy expert 1: 68.68501529051989%
Accuracy expert 2: 56.513761467889914%


In [266]:
def save_expert_labels(labels_df, param, path):
    if path[-1] != "/":
        path = path + "/"
    name = f"labeler_{}_seed_{}_fold_{}_gt_{param["GT"]}_training_{param["SETTING"]}_mod_{param["MOD"]}_overlap_{param["OVERLAP"]}_sample_equal_{param["SAMPLE_EQUAL"]}_labeled_{param["LABELED"]}_initSize_{param["AL"]["INITIAL_SIZE"]}_rounds_{param["AL"]["ROUNDS"]}_labelsPerRound_{param["AL"]["LABELS_PER_ROUND"]}"
    labels_df.to_csv(f"{path}labels_{name}.csv")

SyntaxError: f-string: empty expression not allowed (2971499285.py, line 4)

In [207]:
labeled_list["ssl_al"]["al labels"]

{4295349121: array(['00012233_001.png', '00001792_003.png', '00015698_001.png',
        '00001722_004.png', '00013443_001.png', '00001470_000.png',
        '00018546_002.png', '00018557_011.png', '00019260_007.png',
        '00013601_003.png', '00012184_002.png', '00012387_001.png',
        '00018829_005.png', '00016052_006.png', '00018019_005.png',
        '00014558_011.png', '00014715_011.png', '00001449_002.png',
        '00016887_000.png', '00019643_007.png', '00019240_005.png',
        '00012010_005.png'], dtype='<U16'),
 4295342357: array(['00013073_006.png', '00012010_035.png', '00001722_004.png',
        '00013844_001.png', '00019576_023.png', '00019643_004.png',
        '00018546_006.png', '00012010_026.png', '00012219_007.png',
        '00013916_002.png', '00012543_010.png', '00016561_003.png',
        '00018250_000.png', '00013613_013.png', '00018335_012.png',
        '00019576_060.png', '00013601_022.png', '00013613_016.png',
        '00018329_007.png', '00001249_004.png'],

In [226]:
result = np.empty(0, dtype="str")
for key, item in labeled_list["ssl_al"]["al labels"].items():
    result = np.append(result, item)
result

array(['00012233_001.png', '00001792_003.png', '00015698_001.png',
       '00001722_004.png', '00013443_001.png', '00001470_000.png',
       '00018546_002.png', '00018557_011.png', '00019260_007.png',
       '00013601_003.png', '00012184_002.png', '00012387_001.png',
       '00018829_005.png', '00016052_006.png', '00018019_005.png',
       '00014558_011.png', '00014715_011.png', '00001449_002.png',
       '00016887_000.png', '00019643_007.png', '00019240_005.png',
       '00012010_005.png', '00013073_006.png', '00012010_035.png',
       '00001722_004.png', '00013844_001.png', '00019576_023.png',
       '00019643_004.png', '00018546_006.png', '00012010_026.png',
       '00012219_007.png', '00013916_002.png', '00012543_010.png',
       '00016561_003.png', '00018250_000.png', '00013613_013.png',
       '00018335_012.png', '00019576_060.png', '00013601_022.png',
       '00013613_016.png', '00018329_007.png', '00001249_004.png'],
      dtype='<U16')

In [230]:
df = create_label_df(fullDataset, experts)
df[df["filename"].isin(result)]

Unnamed: 0,filename,gt,prediction_4295349121,prediction_4295342357
21,00001249_004.png,1,1,0
34,00001449_002.png,0,0,0
35,00001470_000.png,0,0,0
64,00001722_004.png,0,0,0
70,00001792_003.png,0,0,0
121,00012010_005.png,1,1,0
135,00012010_026.png,1,1,0
142,00012010_035.png,1,1,0
184,00012184_002.png,0,0,0
192,00012219_007.png,0,0,0


In [231]:
class LabelDataset():
    def __init__(self, data):
        self.y = data["gt"]
        self.x = data.copy().drop(["filename", "gt"], axis=1)

    def __getitem__(self, index: int):
        return np.array(self.x.iloc[index], dtype="float32"), self.y.iloc[index]

    def __len__(self) -> int:
        return len(self.y)

trainLabels = LabelDataset(df[df["filename"].isin(result)])
train_dataloader = DataLoader(datasetLabels, batch_size=8, shuffle=True)

valLabels = LabelDataset(create_label_df(fullDataset, experts))
val_dataloader = DataLoader(datasetLabels, batch_size=16, shuffle=True)

In [246]:
class LabelNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2, 4),
            nn.ReLU(),
            nn.Linear(4, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            #nn.Softmax(dim=0)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

model = LabelNet()#.to(device)

In [251]:
import torch.optim as optim

def train_label_model(model, dataloader):

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.7)

    running_loss = 0.0

    for epoch in range(5):
        for i, data in enumerate(dataloader, 0):
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 0 and i != 0:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
                running_loss = 0.0
            
    
    return model

In [252]:
model = LabelNet()
train_label_model(model, train_dataloader)

[1,   101] loss: 0.705
[1,   201] loss: 0.694
[2,   101] loss: 0.726
[2,   201] loss: 0.691
[3,   101] loss: 0.724
[3,   201] loss: 0.688
[4,   101] loss: 0.722
[4,   201] loss: 0.689
[5,   101] loss: 0.720
[5,   201] loss: 0.687


LabelNet(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=2, out_features=4, bias=True)
    (1): ReLU()
    (2): Linear(in_features=4, out_features=4, bias=True)
    (3): ReLU()
    (4): Linear(in_features=4, out_features=2, bias=True)
  )
)

In [None]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in val_dataloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = model(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the {total} test images: {100 * correct // total} %')

In [263]:
next(iter(val_dataloader))

AttributeError: 'list' object has no attribute 'iloc'