In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# This cell generates the training tiles from the .tif files if needed.
# To generate the small tiles, each image has to be split into smaller fields first, because RAM is limited.

from operator import eq
from re import I
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import PIL
from ctypes import cdll
from PIL import Image
from openslide import OpenSlide
# Set PIL's MAX_IMAGE_PIXELS limit (Image is the PIL.Image module imported above).
Image.MAX_IMAGE_PIXELS = 2_331_200_000

# Input locations of the competition images.
trainingPath: str = '../input/mayo-clinic-strip-ai/train/'
testPath: str = '../input/mayo-clinic-strip-ai/test/'

# Directory the generated tiles are written to; layout below it: label/imgId/sliceNumber
generatedTrainingDataPath = '../training-data/'
# One tile directory per training run: entries 0-2 are selected when training
# model1a, model1b and model2, entry 3 is used as evaluation folder in training mode.
trainingDataPathList = [
    '../training-data/',
    '../training-data2/',
    '../training-data3/',
    '../training-data4/',
]

# Set a flag to True to force the tiles of that label to be generated again.
reGenerateTrainingDataLAA = False
reGenerateTrainingDataCE = False
# Number of tiles requested per field, depending on the label of the image.
targetQuantityOfTilesPerFieldLAA = 2200
targetQuantityOfTilesPerFieldCE = 600

# True: generate data and train the models, False: only evaluate the test images.
trainingMode = False

# The training meta table (train.csv) is only needed in training mode.
if trainingMode:
    trainDataMetaTable = pd.read_csv('../input/mayo-clinic-strip-ai/train.csv')


def checkInnerTileVariance(imgNDArray, startX, startY, height, width, numberOfSamples, avgDifferenceNeeded, maxAcceptedEquals, equalsDecrementIfUnequal, equalsIncrementIfEqual, maxAbsoluteDifferenceToBeEquals):
    """Decide by random sampling whether a tile area contains enough colour variation.

    Random pixels are drawn from one quarter of the tile at a time; after
    int(numberOfSamples / 4) samples the next quarter is used. Every sampled
    pixel is compared with the previously sampled one using the summed absolute
    difference of the first three channels.

    Note that startX/width address the first array axis and startY/height the
    second one.

    imgNDArray: pixel array indexed as [x, y] -> channel values
    startX, startY: first index of the tile on axis 0 / axis 1
    height, width: tile extent; the quarters are int(width / 2) x int(height / 2)
    numberOfSamples: number of random pixels to compare
    avgDifferenceNeeded: minimum average difference for an accepted tile
    maxAcceptedEquals: the tile is rejected as soon as the equality score exceeds this
    equalsDecrementIfUnequal: subtracted from the equality score for differing pixels
    equalsIncrementIfEqual: added to the equality score for (almost) equal pixels
    maxAbsoluteDifferenceToBeEquals: largest difference still counted as "equal"

    Returns True if the tile looks useful, otherwise False (e.g. plain background).
    """
    halfW = int(width / 2)
    halfH = int(height / 2)
    samplesPerQuarter = int(numberOfSamples / 4)
    previousPixel = imgNDArray[startX, startY]
    totalDifference = 0
    equalityScore = 0
    samplesInQuarter = 0
    offsetX = 0
    offsetY = 0
    for _ in range(numberOfSamples):
        lowX = startX + offsetX
        lowY = startY + offsetY
        sampleX = np.random.randint(lowX, lowX + halfW)
        sampleY = np.random.randint(lowY, lowY + halfH)
        pixel = imgNDArray[sampleX, sampleY]
        # int() avoids wrap-around of unsigned channel values when subtracting
        pixelDifference = sum(abs(int(pixel[channel]) - int(previousPixel[channel])) for channel in range(3))
        if pixelDifference > maxAbsoluteDifferenceToBeEquals:
            equalityScore -= equalsDecrementIfUnequal
        else:
            # the pixels are (almost) the same
            equalityScore += equalsIncrementIfEqual
            if equalityScore > maxAcceptedEquals:
                return False
        totalDifference += pixelDifference
        previousPixel = pixel
        samplesInQuarter += 1
        if samplesInQuarter < samplesPerQuarter:
            continue
        # move on to the next quarter: (0,0) -> (x,0) -> (0,y) -> (x,y) -> (0,0)
        samplesInQuarter = 0
        if offsetX == 0:
            offsetX = halfW
            continue
        offsetX = 0
        offsetY = halfH if offsetY == 0 else 0
    return totalDifference / numberOfSamples >= avgDifferenceNeeded


def createRandomTilesFromImage(deletePastResults, fieldNumber, imgResultPath, imgNDArray, imgName, numberOfTilesToGenerate, tileWidth, tileHeight,saveTile):
    """Cut random, sufficiently varied tiles out of one image field.

    Random tile positions are tried until enough tiles were found, 50 tries in a
    row failed, or numberOfTilesToGenerate * 4 tries were made in total. An
    accepted tileWidth x tileHeight area is reduced to half its size by keeping
    every second pixel.

    deletePastResults: if True (and saveTile is set) all files in imgResultPath are removed first
    fieldNumber: number of the field, only used for the file name of saved tiles
    imgResultPath: directory (with trailing slash) the tiles are saved to
    imgNDArray: pixel array of the field; tileWidth is applied to axis 0, tileHeight to axis 1
    imgName: unused, kept for compatibility with existing callers
    numberOfTilesToGenerate: number of tiles wanted from this field
    tileWidth, tileHeight: size of the sampled area before it is halved
    saveTile: True -> tiles are written as .tif files (training),
              False -> tiles are returned as arrays (testing)

    Returns the list of generated tile arrays; it stays empty when saveTile is True.
    """
    generatedTileList = []
    if saveTile:
        if deletePastResults and os.path.exists(imgResultPath):
            # remove all files in the folder
            for file in os.listdir(imgResultPath):
                os.remove(os.path.join(imgResultPath, file))
        # also creates missing parent directories
        os.makedirs(imgResultPath, exist_ok=True)
    # exclusive upper bounds for the random start position of a tile
    maxStartX = imgNDArray.shape[0] - tileWidth
    maxStartY = imgNDArray.shape[1] - tileHeight
    if numberOfTilesToGenerate <= 0 or maxStartX <= 0 or maxStartY <= 0:
        # nothing requested, or the field is too small to place a tile in it
        return generatedTileList
    numberOfTilesGenerated = 0
    maxNumberOfTriesPerTile = 50
    # if more than 75% of the current field is useless, it should not get that many tiles
    maxNumberOfTriesAbsolute = numberOfTilesToGenerate * 4
    numberOfTriesDoneCurrentTile = 0
    numberOfTriesDoneAbsolute = 0
    # the final tile is half the sampled size (it loses some detail) to give the model a
    # wider overview without losing too much performance while training
    generatedTileWidth = int(tileWidth / 2)
    generatedTileHeight = int(tileHeight / 2)
    while (numberOfTilesGenerated < numberOfTilesToGenerate
           and numberOfTriesDoneCurrentTile < maxNumberOfTriesPerTile
           and numberOfTriesDoneAbsolute < maxNumberOfTriesAbsolute):
        # get random coordinates
        startX = np.random.randint(0, maxStartX)
        startY = np.random.randint(0, maxStartY)
        numberOfTriesDoneCurrentTile += 1
        numberOfTriesDoneAbsolute += 1
        # check if the tile is useful
        if not checkInnerTileVariance(imgNDArray=imgNDArray, startX=startX, startY=startY, height=tileHeight, width=tileWidth, numberOfSamples=200, avgDifferenceNeeded=128, maxAcceptedEquals=20, equalsDecrementIfUnequal=1, equalsIncrementIfEqual=2, maxAbsoluteDifferenceToBeEquals=8):
            continue
        # take every second pixel and drop a possible alpha channel
        everySecondPixel = imgNDArray[startX:startX + 2 * generatedTileWidth:2,
                                      startY:startY + 2 * generatedTileHeight:2,
                                      :3]
        # swap the first two axes so the result equals the former per-pixel copy,
        # where array index [x, y] was written to image position (x, y)
        tileArray = np.ascontiguousarray(np.transpose(everySecondPixel, (1, 0, 2)), dtype=np.uint8)
        if saveTile:
            # used for training
            Image.fromarray(tileArray).save(
                imgResultPath + 'fieldNumber' + str(fieldNumber)+'_' + 'tileNumber' + str(numberOfTilesGenerated) + '.tif')
        else:
            # used for testing
            generatedTileList.append(tileArray)
        numberOfTilesGenerated += 1
        numberOfTriesDoneCurrentTile = 0
    return generatedTileList


def imageSplittingAndTileGeneration(imageId, imageFolder, imgLabel, saveTile, optionalNumberOfTilesToGeneratePerField = 0 , tileListToFill = None):
    """Split one slide image into fields and generate random tiles from every field.

    Fields are the parts of a grid the image gets split into (done due to memory
    limitations); the grid has one more row/column per started 16000 pixels.
    Tiles are random but useful parts of the fields (training or test data).

    imageId: file name of the image without the '.tif' extension
    imageFolder: folder (with trailing slash) containing the image
    imgLabel: 'LAA' selects targetQuantityOfTilesPerFieldLAA, everything else
              targetQuantityOfTilesPerFieldCE; with None no tiles are generated
    saveTile: passed on to createRandomTilesFromImage (save to disk vs. return arrays)
    optionalNumberOfTilesToGeneratePerField: if > 0 it overrides the label dependent quantity
    tileListToFill: optional list the generated tiles are appended to;
                    a new list is created for every call when omitted

    Returns the list holding the generated tiles.
    """
    # never use a shared default list, otherwise tiles of different images get mixed
    if tileListToFill is None:
        tileListToFill = []
    # open the image
    slide = OpenSlide(imageFolder + imageId + '.tif')
    try:
        # get the image size
        width, height = slide.dimensions
        if height < 200 or width < 200:
            return tileListToFill
        fieldCount = 0
        numberOfRows = height // 16000 + 1
        numberOfColumns = width // 16000 + 1
        numberOfFields = numberOfRows * numberOfColumns
        # number of tiles per field depends on the label, unless it is given explicitly
        if optionalNumberOfTilesToGeneratePerField > 0:
            numberOfTilesToGeneratePerField = optionalNumberOfTilesToGeneratePerField
        elif imgLabel == 'LAA':
            numberOfTilesToGeneratePerField = targetQuantityOfTilesPerFieldLAA
        else:
            numberOfTilesToGeneratePerField = targetQuantityOfTilesPerFieldCE
        # get the field size
        fieldWidth = int(width / numberOfColumns)
        fieldHeight = int(height / numberOfRows)
        imgName = imageId
        for rowIndex in range(numberOfRows):
            for columnIndex in range(numberOfColumns):
                # read_region expects the location as (x, y): the column selects x, the row selects y
                field = slide.read_region(
                    (columnIndex * fieldWidth, rowIndex * fieldHeight), 0, (fieldWidth, fieldHeight))
                # convert the field to a numpy array
                field = np.array(field)
                if imgLabel is not None:
                    tiles = createRandomTilesFromImage(deletePastResults = (fieldCount == 0), fieldNumber=fieldCount, imgResultPath=generatedTrainingDataPath +
                                               imgLabel + '/' + imgName + '/', imgNDArray=field, imgName=imgName, numberOfTilesToGenerate=numberOfTilesToGeneratePerField, tileWidth=160, tileHeight=160, saveTile=saveTile)
                    tileListToFill.extend(tiles)
                fieldCount += 1
                print('fieldCount: ' + str(fieldCount) + ' of ' + str(numberOfFields) + ' for image ' + imageId + ' done')
    finally:
        # release the slide file even if reading or tile generation fails
        slide.close()
    return tileListToFill


def generateTrainingData():
    """Generate the training tiles for all images listed in trainDataMetaTable.

    Images whose label is not flagged for regeneration (reGenerateTrainingDataCE /
    reGenerateTrainingDataLAA) are skipped. For every other row the image is
    split into fields and tiles, which are saved to disk.
    """
    for rowIndex, metaRow in trainDataMetaTable.iterrows():
        imageId = metaRow['image_id']
        imageLabel = metaRow['label']
        skipCE = imageLabel == 'CE' and not reGenerateTrainingDataCE
        skipLAA = imageLabel == 'LAA' and not reGenerateTrainingDataLAA
        if skipCE or skipLAA:
            continue
        # only process images that are really present in the training folder
        if not os.path.exists(trainingPath + imageId + '.tif'):
            print('image ' + imageId + ' does not exist')
        else:
            imageSplittingAndTileGeneration(imageId, trainingPath, imageLabel, True)
        # report the row number that was handled
        print('image ' + str(rowIndex) + ' done')

# check for every training data directory whether its tiles are already generated
if trainingMode:
    # flags that were set by hand force the regeneration for every directory
    forceRegenerateLAA = reGenerateTrainingDataLAA
    forceRegenerateCE = reGenerateTrainingDataCE
    for trainingDataPath in trainingDataPathList:
        # imageSplittingAndTileGeneration writes to generatedTrainingDataPath,
        # so it has to point to the directory that is currently checked
        generatedTrainingDataPath = trainingDataPath
        generatedTestDataPath = trainingDataPath  # kept for backward compatibility
        # make sure the directory and both label folders exist before listing them
        os.makedirs(generatedTrainingDataPath + '/CE/', exist_ok=True)
        os.makedirs(generatedTrainingDataPath + '/LAA/', exist_ok=True)
        numberOfExistingTrainingImageFoldersCE = len(
            os.listdir(generatedTrainingDataPath + '/CE/'))
        numberOfExistingTrainingImageFoldersLAA = len(
            os.listdir(generatedTrainingDataPath + '/LAA/'))
        # a directory counts as complete if (with 10% tolerance) there is one folder per image
        numberOfExistingTrainingImageFoldersSufficentLAA = int(
            numberOfExistingTrainingImageFoldersLAA * 1.1) >= len(trainDataMetaTable[trainDataMetaTable['label'] == 'LAA'])
        numberOfExistingTrainingImageFoldersSufficentCE = int(
            numberOfExistingTrainingImageFoldersCE * 1.1) >= len(trainDataMetaTable[trainDataMetaTable['label'] == 'CE'])
        # decide per directory, a result of a previous directory must not leak into this one
        reGenerateTrainingDataLAA = forceRegenerateLAA or not numberOfExistingTrainingImageFoldersSufficentLAA
        reGenerateTrainingDataCE = forceRegenerateCE or not numberOfExistingTrainingImageFoldersSufficentCE
        if reGenerateTrainingDataLAA or reGenerateTrainingDataCE:
            generateTrainingData()

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [2]:
# This cell creates, loads or trains the basic models which are used for Monte Carlo predictions on multiple different random samples of the image
# import keras
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam

# locations of the stored models and of the tiles used for training
basicModelPath = '../input/basicmodels4/'
logLossEvaluationModelPath = '../input/losslogevaluationmodel4/'
useTrainingDataPath = '../training-data/'

# tile folders per label and the file paths found in each folder (filled by loadTrainingDataFolders)
trainingDataFoldersLAA = list()
trainingDataFoldersCE = list()
trainingDataFoldersContent = dict()

numberOfTrainingSamplesPerIteration = 2048
numberOfInputImagesForOneModel = 4
numberOfClasses = 2
# positions of the two classes inside a label vector
CE_INDEX, LAA_INDEX = 0, 1

if trainingMode:
    # buffers that get refilled for every training iteration
    accumulatedData = np.zeros(
        (numberOfTrainingSamplesPerIteration, numberOfInputImagesForOneModel, 80, 80, 3),
        dtype=float,
    )
    accumulatedLabels = np.zeros(
        (numberOfTrainingSamplesPerIteration, numberOfClasses),
        dtype=float,
    )

# minimum success factors, one per basic model (model1a, model1b, model2)
dataLoadingFolderSuccessMinimumFactorModelList = [0.62, 0.2, 0.4]
# Training stops as soon as the success value of the data loading falls below this
# factor, i.e. when too many folders fail because their images were already used.
# This protects against overfitting while making efficient use of randomly loaded data.
dataLoadingFolderSuccessMinimumFactor = 0.60
dataLoadingFolderSuccessInflationFactor = 0.9901
newDataLoadingSuccess = 1.0

# Maps an image path to a list of booleans; True at index n means the picture
# was already used as input picture number n of the model.
pictureUsageCheckDictionary = dict()

def createModelType1():
    """Create and compile the basic model that gets several tiles at once.

    Input shape: (numberOfInputImagesForOneModel, 80, 80, 3). Four strided
    convolutions are followed by a stack of dense layers and a sigmoid output
    with two units. The model is compiled with Adam and mean squared error.

    The model is trained on 4 images at once; the intention is that it can learn
    patterns that only show up in the combination of the images.

    Returns the compiled keras model.
    """
    inputLayer = keras.Input(shape=(numberOfInputImagesForOneModel, 80, 80, 3))
    conv1 = Conv2D(32, kernel_size=(3, 3), strides=(2, 2), activation='relu')(inputLayer)
    conv2 = Conv2D(128, kernel_size=(4, 4), strides=(2, 2), activation='relu')(conv1)
    conv3 = Conv2D(128, kernel_size=(3, 3), strides=(2, 2), activation='relu')(conv2)
    conv4 = Conv2D(128, kernel_size=(3, 3), strides=(2, 2), activation='relu')(conv3)
    flatten = Flatten()(conv4)
    dense1 = Dense(4096, activation='relu')(flatten)
    dense2 = Dense(2048, activation='relu')(dense1)
    dense3 = Dense(1024, activation='relu')(dense2)
    dense4 = Dense(512, activation='relu')(dense3)
    dense5 = Dense(128, activation='relu')(dense4)
    outputLayer = Dense(2, activation='sigmoid')(dense5)
    model = keras.Model(inputs=inputLayer, outputs=outputLayer)
    # compile the model; `learning_rate` replaces the deprecated `lr` argument
    model.compile(
        optimizer=Adam(learning_rate=0.00005),
        loss='mse',
        metrics=['accuracy','categorical_accuracy']
    )
    return model

def createModelType2(): #model2 tries to compensate the mistakes of model1a and model1b
    """Create and compile the second-stage model.

    The model has two inputs: the tiles with shape
    (numberOfInputImagesForOneModel, 80, 80, 3) and a vector with 4 values,
    which is fed with the concatenated outputs of model1a and model1b. The
    vector is concatenated to the flattened convolution output before the
    dense layers. The model is compiled with Adam and mean squared error.

    Returns the compiled keras model.
    """
    # convolutional part, built with the functional api
    inputLayer = keras.Input(shape=(numberOfInputImagesForOneModel, 80, 80, 3))
    conv1 = Conv2D(32, kernel_size=(3, 3), strides=(2, 2), activation='relu')(inputLayer)
    conv2 = Conv2D(128, kernel_size=(4, 4), strides=(2, 2), activation='relu')(conv1)
    conv3 = Conv2D(128, kernel_size=(3, 3), strides=(2, 2), activation='relu')(conv2)
    conv4 = Conv2D(128, kernel_size=(3, 3), strides=(2, 2), activation='relu')(conv3)
    flatten = Flatten()(conv4)
    # additional feature input with 4 values: the outputs of model1a and model1b;
    # the shape has to be a tuple, `(4)` would just be the integer 4
    inputLayer2 = keras.Input(shape=(4,))
    concat = keras.layers.concatenate([flatten, inputLayer2])
    dense1 = Dense(5000, activation='relu')(concat)
    dense2 = Dense(2500, activation='relu')(dense1)
    dense3 = Dense(1250, activation='relu')(dense2)
    dense4 = Dense(1250, activation='relu')(dense3)
    dense5 = Dense(512, activation='relu')(dense4)
    dense6 = Dense(128, activation='relu')(dense5)
    outputLayer = Dense(2, activation='sigmoid')(dense6)
    model = keras.Model(inputs=[inputLayer, inputLayer2], outputs=outputLayer)
    # compile the model; `learning_rate` replaces the deprecated `lr` argument
    model.compile(
        optimizer=Adam(learning_rate=0.00005),
        loss='mse',
        metrics=['accuracy','categorical_accuracy']
    )
    return model
    

def loadTrainingDataFolders(trainingDataPath):
    """Collect the usable tile folders below trainingDataPath and their files.

    trainingDataFoldersCE / trainingDataFoldersLAA are emptied and refilled with
    all folders of the respective label that contain at least
    numberOfInputImagesForOneModel entries. For each of those folders the paths
    of all contained files are stored in trainingDataFoldersContent.

    trainingDataPath: directory containing the 'CE' and 'LAA' folders
    """
    global trainingDataFoldersLAA
    global trainingDataFoldersCE
    global trainingDataFoldersContent
    trainingDataFoldersLAA.clear()
    trainingDataFoldersCE.clear()

    # (folder to list, prefix of the stored path, list to fill)
    labelTargets = (
        ('/CE/', 'CE/', trainingDataFoldersCE),
        ('/LAA/', 'LAA/', trainingDataFoldersLAA),
    )
    for listSuffix, storeSuffix, targetList in labelTargets:
        for folder in os.listdir(trainingDataPath + listSuffix):
            # folders with too few tiles can not fill one model input
            if len(os.listdir(trainingDataPath + listSuffix + folder)) >= numberOfInputImagesForOneModel:
                targetList.append(trainingDataPath + storeSuffix + folder + '/')

    # remember the paths of all files of every accepted folder
    for folder in trainingDataFoldersLAA + trainingDataFoldersCE:
        trainingDataFoldersContent[folder] = [folder + file for file in os.listdir(folder)]
    

def loadTrainingData():
    """Fill accumulatedData / accumulatedLabels with a new batch of random training samples.

    Samples alternate between CE and LAA. One sample consists of
    numberOfInputImagesForOneModel random tiles of one random folder. A tile may
    be used only once per input position (tracked in pictureUsageCheckDictionary);
    if more than 3 already used tiles are drawn for one sample, the sample is
    discarded and a new folder is tried.

    newDataLoadingSuccess starts at 1.0, grows by 0.01 for every completed sample
    and is multiplied by dataLoadingFolderSuccessInflationFactor after every
    attempt. Loading stops early once it falls below
    dataLoadingFolderSuccessMinimumFactor.

    Returns True if the final success value is still at least the minimum factor.
    """
    global accumulatedData
    global accumulatedLabels
    global pictureUsageCheckDictionary
    global newDataLoadingSuccess
    allowedRepeatsPerSample = 3
    successBonus = 0.01
    newDataLoadingSuccess = 1.0
    pickCE = True
    sampleIndex = 0
    while sampleIndex < numberOfTrainingSamplesPerIteration:
        # choose a random image folder of the class whose turn it is
        folder = random.choice(trainingDataFoldersCE if pickCE else trainingDataFoldersLAA)
        repeatsSeen = 0
        inputPosition = 0
        sampleComplete = True
        while inputPosition < numberOfInputImagesForOneModel:
            imagePath = random.choice(trainingDataFoldersContent[folder])
            usedAtPosition = pictureUsageCheckDictionary.setdefault(
                imagePath, [False] * numberOfInputImagesForOneModel)
            if usedAtPosition[inputPosition]:
                # the tile was already used at this input position
                repeatsSeen += 1
                if repeatsSeen > allowedRepeatsPerSample:
                    sampleComplete = False
                    break
                continue
            usedAtPosition[inputPosition] = True
            # load the tile with PIL and scale the values by 1/255
            pixels = np.array(Image.open(imagePath), dtype=float)
            pixels *= 1.0 / 255.0
            accumulatedData[sampleIndex][inputPosition] = pixels
            inputPosition += 1
        if sampleComplete:
            # one-hot label of the sample
            accumulatedLabels[sampleIndex][CE_INDEX] = 1.0 if pickCE else 0.0
            accumulatedLabels[sampleIndex][LAA_INDEX] = 0.0 if pickCE else 1.0
            pickCE = not pickCE
            sampleIndex += 1
            newDataLoadingSuccess += successBonus
        newDataLoadingSuccess *= dataLoadingFolderSuccessInflationFactor
        if newDataLoadingSuccess < dataLoadingFolderSuccessMinimumFactor:
            print("Not enough new training data, stopping training")
            break
    return newDataLoadingSuccess >= dataLoadingFolderSuccessMinimumFactor

def trainModelType1():
    """Create a type 1 model and train it until loadTrainingData reports no more data.

    The tile folders are read from useTrainingDataPath. Every successfully
    loaded batch is trained for one epoch.

    Returns the trained model.
    """
    loadTrainingDataFolders(useTrainingDataPath)
    model = createModelType1()
    finishedIterations = 0
    while True:
        if not loadTrainingData():
            break
        model.fit(accumulatedData, accumulatedLabels, epochs=1, batch_size=16)
        finishedIterations += 1
        # print count and success value
        print(f"Iteration: {finishedIterations} new data loading success: {newDataLoadingSuccess}")
    return model

def trainModelType2(model1a, model1b):
    """Create a type 2 model and train it on the tiles plus the outputs of both type 1 models.

    model1a, model1b: trained models whose predictions for the current batch are
                      concatenated and used as second input of the new model

    The tile folders are read from useTrainingDataPath; training continues as
    long as loadTrainingData delivers a new batch.

    Returns the trained model.
    """
    loadTrainingDataFolders(useTrainingDataPath)
    model = createModelType2()
    finishedIterations = 0
    while True:
        if not loadTrainingData():
            break
        # predictions of the first-stage models, joined per sample
        firstStageOutputs = (model1a.predict(accumulatedData), model1b.predict(accumulatedData))
        model2Input = np.concatenate(firstStageOutputs, axis=1)
        model.fit([accumulatedData, model2Input], accumulatedLabels, epochs=1, batch_size=16)
        finishedIterations += 1
        # print count and success value
        print(f"Iteration: {finishedIterations} new data loading success: {newDataLoadingSuccess}")
    return model

def loadModel(modelPath):
    """Load and return the keras model stored at modelPath."""
    return keras.models.load_model(modelPath)


def _loadOrTrainBasicModel(fileName, listIndex, trainFunction):
    # Load the model file from basicModelPath if it exists. Otherwise, in training
    # mode, select the training data path and minimum success factor with the given
    # index, train the model and save it under fileName. Outside of training mode
    # a missing model stops the run.
    global useTrainingDataPath
    global dataLoadingFolderSuccessMinimumFactor
    storedModelPath = basicModelPath + fileName
    if os.path.isfile(storedModelPath):
        return loadModel(storedModelPath)
    if not trainingMode:
        print("No " + fileName + " found, stopping")
        exit()
    useTrainingDataPath = trainingDataPathList[listIndex]
    dataLoadingFolderSuccessMinimumFactor = dataLoadingFolderSuccessMinimumFactorModelList[listIndex]
    trainedModel = trainFunction()
    trainedModel.save(fileName)
    return trainedModel


# every basic model is loaded if its file exists, else it is trained
model1a = _loadOrTrainBasicModel('model1a.h5', 0, trainModelType1)
model1b = _loadOrTrainBasicModel('model1b.h5', 1, trainModelType1)
# model2 needs both type 1 models for its training
model2 = _loadOrTrainBasicModel('model2.h5', 2, lambda: trainModelType2(model1a, model1b))



2022-10-05 13:27:45.475245: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [3]:
# This cell executes the Monte Carlo sampling and trains the final log-loss evaluation model, which gets the results of the Monte Carlo predictions as input
from threading import Thread
from time import sleep

class ImageProcessingThread(Thread):
    """Thread that generates the random tiles of one image.

    After the thread has finished, tileList holds the tiles returned by
    imageSplittingAndTileGeneration for the image.
    """

    def __init__(self,imageId, evaluationFolder, numberOfSamplesPerImageField):
        """Store the parameters of the image to process.

        imageId: file name of the image without extension
        evaluationFolder: folder containing the image
        numberOfSamplesPerImageField: number of tiles to generate per field
        """
        super().__init__()
        self.imageId = imageId
        self.evaluationFolder = evaluationFolder
        self.numberOfSamplesPerImageField = numberOfSamplesPerImageField
        self.tileList = []

    def run(self):
        """Generate the tiles of the image without saving them to disk."""
        # Always hand over a fresh list, so the tiles of this image can never
        # end up in a list that is shared with other images or threads.
        self.tileList = imageSplittingAndTileGeneration(
            self.imageId, self.evaluationFolder, 'unknown', False,
            self.numberOfSamplesPerImageField, tileListToFill=[])

# evaluations: key is the image_id, value are the outputs of
# [model1a, model1b, model2] for all evaluations of that image
evaluations = dict()
# buffer for the random tiles of the image that is currently evaluated
randomImageTiles = list()
numberOfSamplesPerImageField = 250
evaluationsPerImage = 48

if not trainingMode:
    evaluationFolder = '../input/mayo-clinic-strip-ai/test/'
    dataMetaTable = pd.read_csv('../input/mayo-clinic-strip-ai/test.csv')
    #evaluationFolder = '../input/mayo-clinic-strip-ai/train/'
    #dataMetaTable = pd.read_csv('../input/testfile2/test2.csv')
else:
    evaluationFolder = trainingDataPathList[3]
    dataMetaTable = pd.read_csv('../input/mayo-clinic-strip-ai/train.csv')
    numberOfTrainingSamples = 128000
    # one row per sample: evaluationsPerImage evaluations * 3 models * 2 outputs
    accumulatedData = np.zeros(
        (numberOfTrainingSamples, evaluationsPerImage * 6),
        dtype=float,
    )
    accumulatedLabels = np.zeros(
        (numberOfTrainingSamples, numberOfClasses),
        dtype=float,
    )

def processImagesForEvaluation(): #each picture gets evaluated on the fly, results get saved in "evaluations", later the evaluations get combined in a final log-loss/cross-entropy model to a final result
    """Generate tiles for all images of dataMetaTable and predict them with the models.

    The tile generation of up to 3 images runs in background threads (added
    experimentally to speed up the run); the model predictions are done in the
    calling thread by predictAndWriteResult as soon as a thread has finished.
    Images that do not exist in evaluationFolder are reported and skipped.
    """
    maxNumberOfThreads = 3
    runningThreads = []
    # iterate through the rows of the dataframe
    for index, row in dataMetaTable.iterrows():
        # get the image id
        imageId = row['image_id']
        if not os.path.exists(evaluationFolder + imageId + '.tif'):
            print('image ' + imageId + ' does not exist')
            continue
        # wait until at least one of the thread slots is free
        while len(runningThreads) >= maxNumberOfThreads:
            finishedThreads = [thread for thread in runningThreads if not thread.is_alive()]
            if not finishedThreads:
                sleep(0.1)
                continue
            for thread in finishedThreads:
                predictAndWriteResult(thread)
                runningThreads.remove(thread)
        newThread = ImageProcessingThread(imageId, evaluationFolder, numberOfSamplesPerImageField)
        newThread.start()
        runningThreads.append(newThread)
    # wait for the remaining threads and evaluate their tiles
    for thread in runningThreads:
        thread.join()
        predictAndWriteResult(thread)

def predictAndWriteResult(threadObject):
    """Predict the tiles collected by a finished thread and store the result.

    threadObject: object providing imageId and tileList

    If tiles are available, evaluationsPerImage random samples are drawn from
    them and the model outputs are stored in evaluations[imageId]. Images
    without tiles get no entry.
    """
    imageId = threadObject.imageId
    availableTiles = threadObject.tileList
    if len(availableTiles) != 0:
        sampledTiles = readRandomSamplesFromTileListForModel(availableTiles, evaluationsPerImage)
        evaluations[imageId] = predictTilesWithModels(sampledTiles)
    print('image ' + imageId + ' done!')

def readRandomSamplesFromTileListForModel(tileList, numberOfSamples):
    """Draw random tiles (with repetition) for numberOfSamples model inputs.

    tileList: tiles to draw from
    numberOfSamples: number of model inputs; each one needs
                     numberOfInputImagesForOneModel tiles

    Returns the module level list randomImageTiles, which is emptied and
    refilled on every call.
    """
    global randomImageTiles
    randomImageTiles.clear()
    lastIndex = len(tileList) - 1
    randomImageTiles.extend(
        tileList[random.randint(0, lastIndex)]
        for _ in range(numberOfSamples)
        for _ in range(numberOfInputImagesForOneModel)
    )
    # a batch of image tiles to evaluate the image with
    return randomImageTiles

def predictTilesWithModels(listOfNDImagesToEvaluate):
    """Predict a list of tiles with model1a, model1b and model2.

    listOfNDImagesToEvaluate: list of tile arrays; it is regrouped to
        (numberOfTiles / numberOfInputImagesForOneModel,
         numberOfInputImagesForOneModel, 80, 80, 3) and scaled by 1/255

    model2 gets the tiles plus the concatenated outputs of model1a and model1b.

    Returns a one dimensional array: the flattened outputs of model1a,
    model1b and model2, in that order.
    """
    tileBatch = np.array(listOfNDImagesToEvaluate, dtype=float)
    numberOfGroups = int(len(tileBatch) / numberOfInputImagesForOneModel)
    tileBatch = tileBatch.reshape((numberOfGroups, numberOfInputImagesForOneModel, 80, 80, 3))
    # scale the pixel values
    tileBatch *= 1.0 / 255.0
    outputA = model1a.predict(tileBatch)
    outputB = model1b.predict(tileBatch)
    secondStageInput = np.concatenate((outputA, outputB), axis=1)
    output2 = model2.predict([tileBatch, secondStageInput])
    # make the outputs one dimensional and join them
    return np.concatenate([output.flatten() for output in (outputA, outputB, output2)], axis=0)

def loadTrainingData(): #creates and loads the Monte Carlo training data into accumulatedData and accumulatedLabels
    """Create the training data for the final evaluation model.

    NOTE: this definition reuses the name of the loadTrainingData function
    defined for the basic models; once this cell ran, the name refers to this
    function.

    For every one of the numberOfTrainingSamples samples a random tile folder
    below evaluationFolder is chosen (every folder is equally likely), random
    tiles for evaluationsPerImage model inputs are loaded from it and predicted
    with the basic models. The predictions are stored in accumulatedData, the
    one-hot label ([1, 0] for CE, [0, 1] for LAA) in accumulatedLabels.
    Both arrays are saved to disk after every 4000 samples.
    """
    global evaluationFolder
    global accumulatedData
    global accumulatedLabels
    global numberOfTrainingSamples
    loadTrainingDataFolders(evaluationFolder)
    numberOfFoldersCE = len(trainingDataFoldersCE)
    numberOfFoldersTotal = numberOfFoldersCE + len(trainingDataFoldersLAA)
    for i in range(numberOfTrainingSamples):
        # choose the class proportional to its number of folders; randrange excludes
        # the upper bound (randint would include it and favour LAA by one folder)
        nextIsCE = random.randrange(numberOfFoldersTotal) < numberOfFoldersCE
        if nextIsCE:
            folder = random.choice(trainingDataFoldersCE)
        else:
            folder = random.choice(trainingDataFoldersLAA)
        tileList = [] #the list will contain NDArrays of the tiles
        for k in range(evaluationsPerImage):
            for j in range(numberOfInputImagesForOneModel):
                #choose random image
                imagePath = random.choice(trainingDataFoldersContent[folder])
                #load the image into the tileList using PIL and close the file again
                with Image.open(imagePath) as image:
                    tileList.append(np.array(image))
        accumulatedData[i] = predictTilesWithModels(tileList)
        if nextIsCE:
            accumulatedLabels[i] = [1.0, 0.0]
        else:
            accumulatedLabels[i] = [0.0, 1.0]
        if i % 4000 == 3999:
            #save the intermediate result
            np.save('accumulatedData.npy', accumulatedData)
            np.save('accumulatedLabels.npy', accumulatedLabels)
            print('loaded ' + str(i + 1) + ' training samples')
            
def createFinalEvaluationLossLogModel(): #create a model that combines the evaluations of the models
    """Create and compile the final model that combines the basic model outputs.

    The input has evaluationsPerImage * 6 values, the output is a softmax over
    numberOfClasses. The model is compiled with Adam and categorical
    cross-entropy.

    Returns the compiled keras model.
    """
    # create the model using the functional Keras API
    inputLayer = keras.Input(shape=(evaluationsPerImage * 6,))
    layer1 = keras.layers.Dense(576, activation='relu')(inputLayer)
    layer2 = keras.layers.Dense(1152, activation='relu')(layer1)
    layer7 = keras.layers.Dense(576, activation='relu')(layer2)
    layer8 = keras.layers.Dense(144, activation='relu')(layer7)
    outputLayer = keras.layers.Dense(numberOfClasses, activation='softmax')(layer8)
    model = keras.Model(inputs=inputLayer, outputs=outputLayer)
    # `learning_rate` replaces the deprecated `lr` argument
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy', 'categorical_crossentropy', 'categorical_accuracy']
    )
    return model

def trainFinalEvaluationModel():
    """Train the final evaluation model and save it as 'finalLossLogEvaluationModel.h5'.

    Does nothing if the model file already exists in the working directory.
    Training data is read from 'accumulatedData.npy' / 'accumulatedLabels.npy'
    when the data file is present (cut down to ``numberOfTrainingSamples``
    entries); otherwise ``loadTrainingData()`` is called and the global arrays
    are written to those two files.
    NOTE(review): presumably ``loadTrainingData()`` fills the globals
    ``accumulatedData`` and ``accumulatedLabels`` - confirm against its definition.
    """
    global accumulatedData
    global accumulatedLabels
    modelFile = 'finalLossLogEvaluationModel.h5'
    dataFile = 'accumulatedData.npy'
    labelsFile = 'accumulatedLabels.npy'

    # an already trained model is never retrained
    if os.path.isfile(modelFile):
        return

    if not os.path.isfile(dataFile):
        # no cached samples yet: generate them and cache them on disk
        loadTrainingData()
        np.save(dataFile, accumulatedData)
        np.save(labelsFile, accumulatedLabels)
    else:
        # reuse the cached samples, reduced to numberOfTrainingSamples
        cachedData = np.load(dataFile)
        accumulatedData = cachedData
        cachedLabels = np.load(labelsFile)
        accumulatedLabels = cachedLabels
        accumulatedData = accumulatedData[:numberOfTrainingSamples]
        accumulatedLabels = accumulatedLabels[:numberOfTrainingSamples]

    model = createFinalEvaluationLossLogModel()
    # after 16 epochs overfitting starts
    model.fit(accumulatedData, accumulatedLabels, epochs=16, batch_size=32)
    model.save(modelFile)
    

# Entry point of this cell: in training mode train the final evaluation model,
# otherwise run processImagesForEvaluation().
if trainingMode:
    trainFinalEvaluationModel()
else:
    processImagesForEvaluation()


fieldCount: 1 of 2 for image 008e5c_0 done
fieldCount: 2 of 2 for image 008e5c_0 done


2022-10-05 13:28:17.326737: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


image 008e5c_0 done!
fieldCount: 1 of 12 for image 006388_0 done
fieldCount: 1 of 4 for image 00c058_0 done
fieldCount: 2 of 4 for image 00c058_0 done
fieldCount: 3 of 4 for image 00c058_0 done
fieldCount: 4 of 4 for image 00c058_0 done
fieldCount: 2 of 12 for image 006388_0 done
fieldCount: 1 of 8 for image 01adc5_0 done
fieldCount: 2 of 8 for image 01adc5_0 done
fieldCount: 3 of 12 for image 006388_0 done
fieldCount: 3 of 8 for image 01adc5_0 done
fieldCount: 4 of 8 for image 01adc5_0 done
fieldCount: 5 of 8 for image 01adc5_0 done
fieldCount: 4 of 12 for image 006388_0 done
fieldCount: 6 of 8 for image 01adc5_0 done
fieldCount: 7 of 8 for image 01adc5_0 done
fieldCount: 8 of 8 for image 01adc5_0 done
fieldCount: 5 of 12 for image 006388_0 done
fieldCount: 6 of 12 for image 006388_0 done
fieldCount: 7 of 12 for image 006388_0 done
fieldCount: 8 of 12 for image 006388_0 done
fieldCount: 9 of 12 for image 006388_0 done
fieldCount: 10 of 12 for image 006388_0 done
fieldCount: 11 of 12 f

In [4]:
# Disabled sanity check, kept for reference:
# test finalLossLogEvaluationModel.h5 on accumulatedDataTest.npy and accumulatedLabelsTest.npy
# #load the test data
# accumulatedDataTest = np.load('accumulatedDataTest.npy')
# accumulatedLabelsTest = np.load('accumulatedLabelsTest.npy')
# #load the model
# finalEvaluationModel = keras.models.load_model('finalLossLogEvaluationModel.h5')
# #evaluate the model
# finalEvaluationModel.evaluate(accumulatedDataTest, accumulatedLabelsTest, batch_size=32, verbose=1)
# result = finalEvaluationModel.predict(accumulatedDataTest[:50])
# for i in range(50):
#     print('label: ' + str(accumulatedLabelsTest[i]) + ' prediction: ' + str(result[i]))

# Build submission.csv: predict every evaluated image with the final evaluation
# model, average the class scores per patient and write one row per patient.
if not trainingMode:
    finalEvaluationModel = keras.models.load_model(logLossEvaluationModelPath + 'finalLossLogEvaluationModel.h5')
    #dictionary to convert imageId to patientId
    # NOTE(review): never filled in this cell; kept so the name stays defined.
    imageIdToPatientId = {}
    #dictionary to convert patientId to list of score with number of images like [CE score, LAA score, numberOfImages]
    patientIdToScore = {}
    CE_INDEX = 0
    LAA_INDEX = 1
    NUMBER_OF_IMAGES_INDEX = 2

    # Accumulate the predicted scores of every image on its patient.
    # Only the two columns that are used are iterated; iterrows() would build a Series per row.
    for imageId, patientId in zip(dataMetaTable['image_id'], dataMetaTable['patient_id']):
        # every patient gets an entry, even if none of its images was evaluated
        currentScore = patientIdToScore.setdefault(patientId, [0.0, 0.0, 0])
        # images without an entry in "evaluations" contribute nothing
        if imageId not in evaluations:
            continue
        toPredict = evaluations[imageId].reshape(1, evaluationsPerImage * 6)
        # predict the image
        result = finalEvaluationModel.predict(toPredict)
        currentScore[CE_INDEX] += result[0][CE_INDEX]
        currentScore[LAA_INDEX] += result[0][LAA_INDEX]
        currentScore[NUMBER_OF_IMAGES_INDEX] += 1

    # Turn the accumulated sums into per-patient averages. Each patient appears
    # exactly once in the dictionary, so no second pass over the table is needed.
    for currentScore in patientIdToScore.values():
        imageCount = currentScore[NUMBER_OF_IMAGES_INDEX]
        if imageCount != 0:
            currentScore[CE_INDEX] /= imageCount
            currentScore[LAA_INDEX] /= imageCount
        else:
            currentScore[CE_INDEX] = 0.7 #if no information, CE is statistical more likely
            currentScore[LAA_INDEX] = 0.3

    # Build the frame in one step: DataFrame.append is deprecated (removed in
    # pandas 2.0) and copies the whole frame on every call.
    submissionRows = [
        {'patient_id': patientId, 'CE': score[CE_INDEX], 'LAA': score[LAA_INDEX]}
        for patientId, score in patientIdToScore.items()
    ]
    submission = pd.DataFrame(submissionRows, columns=['patient_id', 'CE', 'LAA'])
    submission.to_csv('submission.csv', index=False)
    print('submission created!')

submission created!
