In [None]:
import os
import numpy as np
from shutil import copyfile
from keras import Input, layers, backend, Model, losses, datasets, models, metrics, optimizers, initializers
from keras.utils import Sequence
import tensorflow as tf
import math
import matplotlib.pyplot as plt

# Root data folder. LoadLACBED lists this folder and expects every entry to be
# a sub-folder whose name parses as an int and which contains a "5.npy" file.
Path = "/media/ug-ml/Samsung_T5/Classification/Classification36/Data"  # folder containing all of the data

In [None]:
def LoadLACBED(Path):
    """Load one 128x128 pattern for every crystal folder directly under ``Path``.

    Each entry of ``Path`` must have a name that parses as an integer and must
    contain a ``5.npy`` file; element ``[0]`` of that file is used as the
    crystal's image. Entries are visited in sorted (string) name order.

    Returns:
        (LACBED_Images, All_Paths, CrystalsUsed) where ``LACBED_Images`` is a
        float32 array of shape (n, 128, 128), ``All_Paths`` lists the pattern
        files that were read and ``CrystalsUsed`` lists the integer folder
        names, all in the same order.
    """
    folder_names = sorted(os.listdir(Path))
    CrystalsUsed = [int(name) for name in folder_names]
    All_Paths = [os.path.join(Path, str(name) + "/5.npy") for name in folder_names]

    LACBED_Images = np.zeros((len(All_Paths), 128, 128), dtype=np.float32)
    for slot, pattern_file in enumerate(All_Paths):
        # Only the first element along axis 0 of the stored array is kept.
        img = np.load(pattern_file)[0].astype(np.float32)
        print(slot, img.shape)
        LACBED_Images[slot] = img
    return (LACBED_Images, All_Paths, CrystalsUsed)




In [None]:
# Load one pattern per crystal folder under Path. CrystalsUsed holds the integer
# folder names in load order and is read later by KeepCrystalImages.
LACBED_Images, All_Paths, CrystalsUsed = LoadLACBED(Path)

In [None]:
def LogAllImages(UnitCell_Images):
    """Return ``log(1 + pixel)`` for every pixel value.

    The result is what LogLossImages consumes to compute pairwise log losses.
    """
    return np.log(UnitCell_Images + 1)


def LogLossImages(LogValues):
    """Build the symmetric matrix of summed squared differences between images.

    Entry ``[i][j]`` is ``sum((LogValues[i] - LogValues[j]) ** 2)``, stored as
    float32; the diagonal stays at zero. The index of each finished row is
    printed as a progress indicator.
    """
    count = len(LogValues)
    LossPairs = np.zeros((count, count), dtype=np.float32)
    for first in range(count):
        # Only the upper triangle is computed; each value is mirrored below it.
        for second in range(first + 1, count):
            LossPairs[first, second] = np.sum(np.square(LogValues[first] - LogValues[second]))
            LossPairs[second, first] = LossPairs[first, second]
        print(first)
    return LossPairs
    
    
def KeepCrystalImages(LossPairs, MLL_crit, crystal_ids=None):
    """Flag near-duplicate crystals for removal.

    Crystal ``j`` is discarded when its loss against any earlier crystal
    ``i < j`` is strictly below ``MLL_crit``.

    NOTE(review): as in the original logic, a crystal ``i`` that has itself
    been discarded still acts as a reference for later crystals; confirm that
    this transitive removal is intended.

    Args:
        LossPairs: square matrix of pairwise losses (see LogLossImages).
        MLL_crit: loss threshold below which two crystals count as duplicates.
        crystal_ids: identifier of each crystal, in the same order as the rows
            of ``LossPairs``. Defaults to the notebook-level ``CrystalsUsed``
            list, which is what the original implementation always used.

    Returns:
        (discard_crystals, KeepCrystals, NumberCrystalsToKeep): identifiers of
        the discarded crystals, an integer 0/1 keep mask, and the number of
        crystals kept.
    """
    if crystal_ids is None:
        crystal_ids = CrystalsUsed  # notebook-level list returned by LoadLACBED

    NumberImages = len(LossPairs)
    # Builtin int instead of np.int: the alias was removed in NumPy 1.24.
    KeepCrystals = np.ones(NumberImages, dtype=int)

    for i in range(NumberImages):
        for j in range(i + 1, NumberImages):
            if KeepCrystals[j] == 1 and LossPairs[i][j] < MLL_crit:
                KeepCrystals[j] = 0
        print(i)

    discard_crystals = [crystal_ids[k] for k in range(NumberImages) if KeepCrystals[k] != 1]
    NumberCrystalsToKeep = NumberImages - len(discard_crystals)

    return (discard_crystals, KeepCrystals, NumberCrystalsToKeep)
                                   
                                   
def ShuffleIndexCreator(NumberCrystalsToUse, seed=None):
    """Return a random permutation of ``0 .. NumberCrystalsToUse - 1``.

    Args:
        NumberCrystalsToUse: length of the permutation.
        seed: optional seed forwarded to ``np.random.default_rng``. Pass a
            fixed value to make the train/validation/test split reproducible;
            the default ``None`` keeps the original non-deterministic
            behaviour.

    Returns:
        1-D integer array holding the shuffled indices.
    """
    rng = np.random.default_rng(seed)
    # Builtin int instead of np.int: the alias was removed in NumPy 1.24.
    ShuffleIndex = np.arange(NumberCrystalsToUse, dtype=int)
    rng.shuffle(ShuffleIndex)
    return ShuffleIndex



def CreateNewDataPaths(ShuffleIndex, CrystalsUsed, KeepCrystals, NumberCrystalsToKeep, Path, RatioInSets):
    """Distribute the kept crystals over training, validation and test sets.

    The k-th kept crystal (in ``KeepCrystals`` order) goes to the set selected
    by ``ShuffleIndex[k]``: values below the training size go to training, the
    next block to validation and the rest to test. Only ``RatioInSets[0]`` and
    ``RatioInSets[1]`` are read; the test set receives whatever remains.

    Returns:
        (TrainingPaths, ValidationPaths, TestPaths, NumberInSet) where each
        ``*Paths`` is ``[input_paths, output_paths]`` pointing at
        ``<Path>/<crystal>/Input.npy`` and ``.../Output.npy``, and
        ``NumberInSet`` is ``[n_training, n_validation, n_test]``.
    """
    n_training = int(NumberCrystalsToKeep * RatioInSets[0])
    n_validation = int(NumberCrystalsToKeep * RatioInSets[1])
    n_test = NumberCrystalsToKeep - n_validation - n_training

    # One [inputs, outputs] pair of path lists per set.
    TrainingPaths = [[], []]
    ValidationPaths = [[], []]
    TestPaths = [[], []]

    kept_seen = 0  # position of the current kept crystal inside ShuffleIndex
    for position, keep_flag in enumerate(KeepCrystals):
        if keep_flag != 1:
            continue

        folder = "/".join([Path, str(CrystalsUsed[position]), ""])
        rank = ShuffleIndex[kept_seen]
        if rank < n_training:
            destination = TrainingPaths
        elif rank < n_training + n_validation:
            destination = ValidationPaths
        else:
            destination = TestPaths

        destination[0].append(folder + "Input.npy")
        destination[1].append(folder + "Output.npy")
        kept_seen += 1

    NumberInSet = [n_training, n_validation, n_test]
    return (TrainingPaths, ValidationPaths, TestPaths, NumberInSet)



def LoadNewUnitCell(TrainingPaths, ValidationPaths, TestPaths, NumberInSet):
    """Load the input images of the three sets into float32 arrays.

    Each ``*Paths`` argument is ``[input_paths, output_paths]``; only the
    input paths (index 0) are read. ``NumberInSet[k]`` files are loaded for
    set ``k`` (0 = training, 1 = validation, 2 = test).

    Returns:
        ``[training, validation, test]``, each of shape
        ``(NumberInSet[k], 128, 128)``.
    """
    path_sets = (TrainingPaths, ValidationPaths, TestPaths)
    AllNewImages = [
        np.zeros((NumberInSet[which], 128, 128), dtype=np.float32) for which in range(3)
    ]

    for which in range(3):
        input_files = path_sets[which][0]
        for slot in range(NumberInSet[which]):
            AllNewImages[which][slot] = np.load(input_files[slot]).astype(np.float32)

    return AllNewImages
    


    
def LogAllNewImages(AllNewImages):
    """Apply LogAllImages to the training, validation and test image arrays.

    ``AllNewImages`` is ``[training, validation, test]``; the result is a list
    in the same order. The log values are what the sets are later compared on.
    """
    LogAllUnitCells = [LogAllImages(AllNewImages[which]) for which in (0, 1, 2)]
    return LogAllUnitCells
    
    
    
def LogLossNewImages(LogAllUnitCells):
    """Compute summed squared differences between training images and the other sets.

    ``LogAllUnitCells`` is ``[training, validation, test]``.

    Returns:
        (TrainValidationPairs, TrainTestPairs, DataSetSize): float32 matrices
        whose entry ``[i][j]`` compares training image ``i`` with
        validation/test image ``j``, plus the list of the three set sizes.
        Progress is printed with the prefixes "1: " (validation) and "2: "
        (test).
    """
    training = LogAllUnitCells[0]
    DataSetSize = [len(LogAllUnitCells[0]), len(LogAllUnitCells[1]), len(LogAllUnitCells[2])]

    def distances_to(which, tag):
        # Rows follow the training set, columns follow set number `which`.
        others = LogAllUnitCells[which]
        table = np.zeros((DataSetSize[0], DataSetSize[which]), dtype=np.float32)
        for row in range(DataSetSize[0]):
            print(tag, row)
            for col in range(DataSetSize[which]):
                table[row, col] = np.sum(np.square(training[row] - others[col]))
        return table

    TrainValidationPairs = distances_to(1, "1: ")
    TrainTestPairs = distances_to(2, "2: ")

    return (TrainValidationPairs, TrainTestPairs, DataSetSize)


def FindBestPairs(TrainValidationPairs, TrainTestPairs, DataSetSize):
    """Find, for every validation and test image, the closest training image.

    Args:
        TrainValidationPairs: losses indexed ``[training][validation]``.
        TrainTestPairs: losses indexed ``[training][test]``.
        DataSetSize: ``[n_training, n_validation, n_test]``.

    Returns:
        (BestPairTrainValidation, BestPairTrainTest): integer arrays holding,
        per validation/test image, the index of the training image with the
        smallest loss. Ties go to the lowest index; an entry stays 0 when no
        loss is below infinity (including an empty training set). Progress is
        printed with the prefixes "3: " and "4: ".
    """
    n_training = DataSetSize[0]

    def closest_training_index(pair_losses, n_other, tag):
        # Builtin int instead of np.int: the alias was removed in NumPy 1.24.
        best = np.zeros(n_other, dtype=int)
        for i in range(n_other):
            print(tag, i)
            min_val = np.inf
            for j in range(n_training):
                # Strict "<" keeps the first of several equal minima.
                if pair_losses[j][i] < min_val:
                    best[i] = j
                    min_val = pair_losses[j][i]
        return best

    BestPairTrainValidation = closest_training_index(TrainValidationPairs, DataSetSize[1], "3: ")
    BestPairTrainTest = closest_training_index(TrainTestPairs, DataSetSize[2], "4: ")
    return (BestPairTrainValidation, BestPairTrainTest)


#LimitNumberImages = 1
def BestPairLoss(TrainingPaths, ValidationPaths, TestPaths, BestPairTrainValidation, BestPairTrainTest):
    """Score each validation/test output against its matched training output.

    For item ``i`` of the validation (or test) set, the output file of the
    training sample ``BestPair...[i]`` and the item's own output file are
    loaded (index 1 of each ``*Paths`` pair) and compared with ZMCC.

    Returns:
        (Val_Loss, Test_Loss, Val_Loss_List, Test_Loss_List): the mean score
        of each set followed by the per-item float32 score arrays. Progress is
        printed with the prefixes "1: " (validation) and "2: " (test).
    """
    def score_matches(best_match, other_paths, tag):
        running_total = 0
        per_item = np.zeros(len(best_match), dtype=np.float32)
        for k in range(len(best_match)):
            Image1 = np.load(TrainingPaths[1][best_match[k]])
            Image2 = np.load(other_paths[1][k])
            # ZMCC is the metric in use; MeanSquareLogError is the alternative.
            loss = ZMCC(Image1, Image2)
            running_total += loss
            per_item[k] = loss
            print(tag, k)
        return running_total, per_item

    Val_Loss_Sum, Val_Loss_List = score_matches(BestPairTrainValidation, ValidationPaths, "1: ")
    Test_Loss_Sum, Test_Loss_List = score_matches(BestPairTrainTest, TestPaths, "2: ")

    Val_Loss = Val_Loss_Sum / len(BestPairTrainValidation)
    Test_Loss = Test_Loss_Sum / len(BestPairTrainTest)
    return (Val_Loss, Test_Loss, Val_Loss_List, Test_Loss_List)

def MeanSquareLogError(Image_1, Image_2):
    """Return the summed squared difference of ``log(1 + Image_1)`` and ``log(1 + Image_2)``.

    Despite the name, the squared differences are summed, not averaged: there
    is no division by the number of pixels.
    """
    log_first = np.log(1 + Image_1)
    log_second = np.log(1 + Image_2)
    return np.sum((log_first - log_second) ** 2)
    
def WritePaths(Paths, File):
    """Write every entry of ``Paths`` to the open file ``File``, one per line."""
    for entry in Paths:
        line = entry + "\n"
        File.write(line)
    return None

def ZMCC(Image1, Image2):
    """Zero-mean normalised cross-correlation of two images.

    Computes ``sum((I1 - mean1) * (I2 - mean2)) / (N * std1 * std2)`` where
    ``N`` is the number of elements being summed. The original hard-coded
    ``N = 128 * 128``, which is only correct for 128x128 images; using the
    actual element count gives the same value for that size and a correct
    coefficient for any other size.

    Args:
        Image1, Image2: array-likes of the same shape.

    Returns:
        The correlation coefficient: 1 for images identical up to a positive
        scale and offset, -1 for inverted ones. If either image is constant
        (zero standard deviation) the division is by zero and the result is
        nan or inf, as in the original.
    """
    Image1 = np.asarray(Image1)
    Image2 = np.asarray(Image2)

    mean1, sd1 = np.mean(Image1), np.std(Image1)
    mean2, sd2 = np.mean(Image2), np.std(Image2)

    products = (Image1 - mean1) * (Image2 - mean2)
    zmcc = (1 / (products.size * sd1 * sd2)) * np.sum(products)
    return zmcc

In [None]:
# Log-transform every loaded image, log(1 + pixel), before the pairwise comparison.
LogValues = LogAllImages(LACBED_Images)

In [None]:
# Pairwise summed squared log differences between all images. The nested Python
# loops visit every pair, and one progress line is printed per image.
LossPairs = LogLossImages(LogValues)

In [None]:
# Duplicate threshold: a crystal is discarded when its loss against an earlier
# crystal in LossPairs is strictly below this value.
MLL_crit = 1
discard_crystals, KeepCrystals, NumberCrystalsToKeep = KeepCrystalImages(LossPairs, MLL_crit)
# Total number of crystals compared, followed by how many survive the filter.
print(len(LossPairs), NumberCrystalsToKeep)

In [None]:
# Show the identifiers of the discarded crystals and save them; the file name
# is relative, so it is written to the current working directory.
print(discard_crystals)
dc = np.array(discard_crystals)
np.save("discard_1.npy", dc)

In [None]:
# Random permutation of the kept crystals; it decides which set each one joins.
ShuffleIndex = ShuffleIndexCreator(NumberCrystalsToKeep)

In [None]:
# Fractions for [training, validation, test]. CreateNewDataPaths reads only the
# first two entries; the test set receives whatever is left over.
RatioInSets = [0.85, 0.1, 0.05]
TrainingPaths, ValidationPaths, TestPaths, NumberInSet = CreateNewDataPaths(ShuffleIndex, CrystalsUsed, KeepCrystals, NumberCrystalsToKeep, Path, RatioInSets)

In [None]:
# Write the sorted input/output path lists of each set to
# "<NewPath>/<Set><Input|Output><Name>.txt", one path per line.
NewPath = "/home/ug-ml/felix-ML/classification/Data/FilePaths"
Name = "_1"

# (file stem, paths) for the six list files; index 0 of each *Paths pair holds
# the input paths and index 1 the output paths.
PathListFiles = [
    ("TrainingInput", TrainingPaths[0]),
    ("ValidationInput", ValidationPaths[0]),
    ("TestInput", TestPaths[0]),
    ("TrainingOutput", TrainingPaths[1]),
    ("ValidationOutput", ValidationPaths[1]),
    ("TestOutput", TestPaths[1]),
]

for FileStem, PathsToWrite in PathListFiles:
    # "with" closes each file even if writing fails part-way through.
    with open(NewPath + "/" + FileStem + Name + ".txt", "w") as PathListFile:
        WritePaths(sorted(PathsToWrite), PathListFile)

In [None]:
def gen_paths_fromfile(Path):
    """Read a text file holding one path per line.

    Surrounding whitespace (including the newline) is stripped and blank lines
    are skipped. Each line is kept whole, so a path containing spaces is no
    longer cut at its first space and an empty line no longer raises
    IndexError.

    Args:
        Path: name of the text file to read.

    Returns:
        1-D numpy array of dtype ``object`` containing the paths as strings.
    """
    with open(Path) as textFile:
        stripped_lines = (line.strip() for line in textFile)
        Paths = [entry for entry in stripped_lines if entry]

    Paths = np.array(Paths, dtype="object")
    return Paths

# Folder holding the "_0point1" path-list files. Note that this is a different
# folder and file suffix from the "_1" lists written under NewPath.
data_path = "/home/ug-ml/felix-ML/VAE_000/DataAllInOne_Normalised/VAE_000_2/FilePaths/"

# Each set is read as an (input paths, output paths) pair and then bundled as
# [inputs, outputs], the layout the loading and scoring functions index into.
TrainingPathsInput, TrainingPathsOutput = (
    gen_paths_fromfile(data_path + "TrainingInput_0point1.txt"),
    gen_paths_fromfile(data_path + "TrainingOutput_0point1.txt"),
)
ValidationPathsInput, ValidationPathsOutput = (
    gen_paths_fromfile(data_path + "ValidationInput_0point1.txt"),
    gen_paths_fromfile(data_path + "ValidationOutput_0point1.txt"),
)
TestPathsInput, TestPathsOutput = (
    gen_paths_fromfile(data_path + "TestInput_0point1.txt"),
    gen_paths_fromfile(data_path + "TestOutput_0point1.txt"),
)

NewTrainingPaths = [TrainingPathsInput, TrainingPathsOutput]
NewValidationPaths = [ValidationPathsInput, ValidationPathsOutput]
NewTestPaths = [TestPathsInput, TestPathsOutput]

In [None]:
# Set sizes come from the path lists read from file; this rebinds the
# NumberInSet that CreateNewDataPaths returned earlier in the notebook.
NumberInSet = [len(NewTrainingPaths[0]), len(NewValidationPaths[0]), len(NewTestPaths[0])]
# Only the input files (index 0 of each pair) are loaded, as float32 arrays.
AllNewImages = LoadNewUnitCell(NewTrainingPaths, NewValidationPaths, NewTestPaths, NumberInSet)

In [None]:
# log(1 + pixel) of the training, validation and test inputs, in that order.
LogAllUnitCells = LogAllNewImages(AllNewImages)

In [None]:
# Summed squared log differences between every training input and every
# validation/test input; progress is printed with the prefixes "1: " and "2: ".
TrainValidationPairs, TrainTestPairs, DataSetSize = LogLossNewImages(LogAllUnitCells)

In [None]:
# For each validation/test sample, the index of the training sample whose input
# has the smallest loss against it.
BestPairTrainValidation, BestPairTrainTest = FindBestPairs(TrainValidationPairs, TrainTestPairs, DataSetSize)

In [None]:
# Compare each validation/test output with the output of its best-matching
# training sample using ZMCC; Val_Loss and Test_Loss are the per-set means.
Val_Loss, Test_Loss, Val_Loss_List, Test_Loss_List = BestPairLoss(NewTrainingPaths, NewValidationPaths, NewTestPaths, BestPairTrainValidation, BestPairTrainTest)
print(Val_Loss, Test_Loss)

In [None]:
# Save the per-sample ZMCC scores of the validation and test sets as .npy files
# inside SaveLossDataPath.
SaveLossDataPath = "/home/ug-ml/felix-ML/VAE_000/DataAllInOne_Normalised/VAE_000_2/DataAnalysis/0point1_data"
ValName = "/Validation_MostSimilar_0point1_ZMCC.npy"
TestName = "/Test_MostSimilar_0point1_ZMCC.npy"

np.save(SaveLossDataPath + ValName, Val_Loss_List)
np.save(SaveLossDataPath + TestName, Test_Loss_List)

In [None]:
# Nearest-neighbour baseline on the test set: every test sample is compared
# with the training sample whose input is closest to it (BestPairTrainTest).
average_loss = 0
Mean_ZMCC = 0
Rms_losses = []
reconstruction_losses = []

AverageAllImages = np.zeros((128, 128), dtype=np.float32)
AverageUnitCell = np.zeros((128, 128), dtype=np.float32)

NumberTestSamples = len(NewTestPaths[0])

for i in range(NumberTestSamples):
    x = np.load(NewTestPaths[0][i])  # Test unit cell
    y = np.load(NewTestPaths[1][i])  # Test LACBED image

    # Running sums; both are divided by the sample count after the loop.
    AverageAllImages = AverageAllImages + y
    AverageUnitCell = AverageUnitCell + x

    # Input (a) and output (b) of the best-matching training sample.
    a = np.load(NewTrainingPaths[0][BestPairTrainTest[i]])
    b = np.load(NewTrainingPaths[1][BestPairTrainTest[i]])

    Input_MS = MeanSquareLogError(a, x)
    Output_MS = MeanSquareLogError(y, b)
    zmcc = ZMCC(y, b)
    Mean_ZMCC += zmcc
    Rms_losses.append(Input_MS)

    # NOTE(review): this list receives the input loss, exactly like Rms_losses;
    # confirm whether Output_MS was intended here.
    reconstruction_losses.append(Input_MS)
    average_loss += Output_MS

    # Show the samples whose matched training output correlates poorly.
    if zmcc <= 0.5:
        print(i, BestPairTrainTest[i])
        print("Input loss: ", Input_MS)
        print("Output loss: ", Output_MS)
        print("ZMCC loss: ", zmcc)
        fig, axes = plt.subplots(1, 4, figsize=(8, 8))
        panels = (
            (x, "test input"),
            (y, "test output"),
            (b, "matched training output"),
            (a, "matched training input"),
        )
        for ax, (image, title) in zip(axes, panels):
            ax.imshow(image)
            ax.set_title(title, fontsize=8)
        plt.show()
        plt.close(fig)  # release the figure so memory does not grow with the loop

if NumberTestSamples > 0:
    # Turn the accumulated sums into the averages their names promise.
    AverageAllImages = AverageAllImages / NumberTestSamples
    AverageUnitCell = AverageUnitCell / NumberTestSamples
    print("Average loss: ", average_loss / NumberTestSamples)
    print("Average ZMCC is: ", Mean_ZMCC / NumberTestSamples)
else:
    print("No test samples to evaluate.")

In [None]:
# Image accumulated from the test outputs in the evaluation loop.
plt.imshow(AverageAllImages)
plt.show()

# Image accumulated from the test inputs in the evaluation loop.
plt.imshow(AverageUnitCell)
plt.show()