In [1]:
import numpy as np
from numpy import asarray

from PIL import Image

import os

import pandas as pd

import keras
from keras.layers import Dense, Softmax, Conv2D, Input, MaxPooling2D, Flatten, RandomContrast
from keras.models import Sequential, load_model
from keras.utils import to_categorical
from keras.preprocessing.image import img_to_array, load_img

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from skimage.transform import resize
from sklearn.metrics import accuracy_score, classification_report
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV, KFold, cross_validate
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import Dataset, DataLoader

from joblib import dump, load

from matplotlib import pyplot as plt



# Load the data

In [2]:
'''

CREATE THE .NPZ FILE

'''
# folder_path = './data/MixedTrainingNumpy/'

# data = {}

# for file in os.listdir(folder_path):
#     if file.endswith('.npy'):

#         file_path = os.path.join(folder_path, file)
#         array = np.load(file_path)

#         label = file[0]

#         data[label] = array

# np.savez('./data/Training.npz', **data)

'\n\nCREATE THE .NPZ FILE\n\n'

In [3]:
'''

LOADS AND "FORMALISES" DATA SO CAN BE PASSED INTO PREPROCESSING

'''

def loadData(filePath):
    """Load every ``.npy`` image in a directory together with its one-hot label.

    Each array is resized to 128x128 (anti-aliased). The label of an image is
    the first character of its file name; the distinct label characters are
    mapped to integers with ``LabelEncoder`` and then one-hot encoded with
    ``to_categorical``.

    Files are read in sorted name order. ``os.listdir`` makes no ordering
    guarantee, so without sorting the sample order -- and therefore any seeded
    split the caller makes afterwards -- could differ between machines.
    NOTE(review): a model saved from an earlier, unsorted run was trained on a
    different split; retrain it before comparing held-out scores.

    Args:
        filePath: Directory containing the ``.npy`` image files.

    Returns:
        tuple: ``(imagesNP, intLabels)`` -- the stacked resized images and the
        matching one-hot label matrix. No train/validation split is done here;
        that is left to the caller.

    Raises:
        ValueError: If ``filePath`` contains no ``.npy`` files.
    """
    # Sort for a reproducible sample order (see docstring).
    npyFiles = sorted(name for name in os.listdir(filePath) if name.endswith('.npy'))
    if not npyFiles:
        # Fail early with a clear message instead of an obscure error raised
        # later by the label encoding on an empty array.
        raise ValueError(f"No .npy files found in {filePath!r}")

    images = []
    labels = []

    for filename in npyFiles:
        img = np.load(os.path.join(filePath, filename))

        # TODO: the original note reports that skipping normalisation did not
        # yield the same values -- revisit how the pixel range is handled.
        img = resize(img, (128, 128), anti_aliasing=True)

        # Optional visual check while debugging:
        # display_img = np.clip(img * 255, 0, 255).astype('uint8')
        # plt.imshow(display_img)
        # plt.show()

        images.append(img)

        # The class label is encoded as the first character of the file name.
        labels.append(filename[0])

    imagesNP = np.array(images)
    labelsNP = np.array(labels)

    # Labels are characters, so map them to integer ids before one-hot encoding.
    encoder = LabelEncoder()
    intLabels = encoder.fit_transform(labelsNP)
    intLabels = to_categorical(intLabels)

    return imagesNP, intLabels

In [4]:
def featureExtractionModel():
    """Assemble the (uncompiled) CNN used for classification / feature extraction.

    Architecture, in order: 128x128x3 input, Conv(64, 3x3, relu), 2x2 max-pool,
    Conv(32, 3x3, relu), 2x2 max-pool, Flatten, Dense(128, relu) and a final
    4-unit softmax layer.

    Returns:
        The ``Sequential`` model with all layers added; compiling and training
        are left to the caller.
    """
    # Convolutional part: two conv + pool stages, flattened into one vector.
    convStack = [
        Input(shape=(128, 128, 3)),  # 128x128 images with 3 channels
        Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
    ]

    # Dense head: hidden layer followed by the 4-way softmax output.
    denseHead = [
        Dense(units=128, activation='relu'),
        Dense(units=4, activation='softmax'),
    ]

    network = Sequential()
    for layer in convStack + denseHead:
        network.add(layer)
    return network

In [37]:
def main():
    """Extract CNN features for the image set and evaluate a random forest on them.

    Steps:
      1. Load the images and one-hot labels with ``loadData`` and hold out 20%
         of them as a test set (``random_state=42``).
      2. Load the saved CNN from ``./CNN.h5`` (the code that trains and saves it
         is kept below, commented out) and use the output of its
         third-from-last layer as the feature vector of each image.
      3. 5-fold cross-validate, then fit, a ``RandomForestClassifier`` on the
         training features.
      4. On the test set, replace every class-1 prediction whose probability is
         below 0.6 with the most probable *other* class, then print per-sample
         results, a per-class count comparison and the accuracy.

    Nothing is returned; all results are printed.
    """
    filePath = './data/MixedTrainingNumpy/'
    imagesNP, labelsNP = loadData(filePath)
    print("Done")

    imTrain, imTest, labTrain, labTest = train_test_split(imagesNP, labelsNP, test_size=0.2, random_state=42)

    # Labels are one-hot encoded; scikit-learn estimators and metrics expect
    # integer class ids, so convert once and reuse.
    yTrain = np.argmax(labTrain, axis=1)
    yTest = np.argmax(labTest, axis=1)

    # --- CNN training (disabled; produces ./CNN.h5 which is loaded below) ---
    # kFold = KFold(n_splits=2, shuffle=True, random_state=42)

    # foldN = 1
    # bestLoss = 10

    # for train, test in kFold.split(imTrain, labTrain):
    #     model = featureExtractionModel()

    #     optimiser = keras.optimizers.Adam(learning_rate=0.001)
    #     model.compile(loss='categorical_crossentropy', metrics=['AUC', 'accuracy'], optimizer=optimiser)

    #     print(f'Training for fold {foldN}...')
    #     model.fit(imTrain[train], labTrain[train], epochs=4, batch_size=4, validation_data=(imTrain[test], labTrain[test]))

    #     foldN += 1
    #     score = model.evaluate(imTrain[test], labTrain[test], verbose=0)
    #     print(f'Score for fold {foldN}: {model.metrics_names[0]} of {score[0]}; {model.metrics_names[1]} of {score[1]*100}%')

    #     if score[0] < bestLoss:
    #         bestLoss = score[0]
    #         bestModel = model

    # bestModel.save("./CNN.h5")


    # test_score = model.evaluate(imTest, labTest, verbose=0)
    # print(f'Test Score: Loss = {test_score[0]}; AUC = {test_score[1]*100}%; Accuracy = {test_score[2]*100}%')


    model = load_model("./CNN.h5")

    # Cut the network three layers from the end and use that activation as the
    # feature vector (presumably the Flatten output if CNN.h5 was built by
    # featureExtractionModel -- confirm against the saved model).
    featureModel = keras.Model(inputs=model.inputs, outputs=model.layers[-3].output)

    featureTrain = featureModel.predict(imTrain)
    print(featureTrain.shape)
    featureTest = featureModel.predict(imTest)
    print(featureTest.shape)

    # ------------------------------------------------------------------
    # Random forest on the extracted features
    # ------------------------------------------------------------------

    # Search space for the (currently disabled) HalvingRandomSearchCV below;
    # a full grid search takes too long.
    randForestDist = {
        'n_estimators' : sp_randint(500, 1000),
        'criterion' : ['gini', 'entropy', 'log_loss'],
        'max_depth' : [10, 20, 30, None],
        'min_samples_split' : sp_randint(2, 10),
        'min_samples_leaf' : sp_randint(1, 8),
        'max_features' : ['log2', 'sqrt', None],
        'bootstrap' : [True, False],
        'warm_start' : [True, False],
        # The valid option is spelled 'balanced_subsample'.
        'class_weight' : ['balanced_subsample', None],
    }

    # Author's notes on earlier configurations:
    # Poor result ==> {'bootstrap': False, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 924, 'warm_start': False} Random Forest Classifier Accuracy: 90.75043630017451%
    # n_estimators=1000, random_state=42, max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, bootstrap=False
    #
    # n_estimators=100, random_state=42, max_depth=10, max_features='sqrt',
    #                                    min_samples_leaf=10, min_samples_split=20, bootstrap=False

    # ccp_alpha=0.01 was noted by the author as making results significantly worse.
    randForest = RandomForestClassifier(n_estimators=500, random_state=42, max_depth=10, max_features='sqrt',
                                        min_samples_leaf=10, min_samples_split=20, bootstrap=False, ccp_alpha=0.03)

    crossVal = cross_validate(randForest, featureTrain, yTrain, cv=5, return_train_score=True)
    print("Training score: ", crossVal['train_score'])
    print("Validation score: ", crossVal['test_score'])

    #forestGridSearch = HalvingRandomSearchCV(estimator=randForest, param_distributions=randForestDist, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
    #forestGridSearch.fit(featureTrain, yTrain)
    #print("Best parameters found: ", forestGridSearch.best_params_)
    randForest.fit(featureTrain, yTrain)

    #optimalRandForest = forestGridSearch.best_estimator_

    rfPredProb = randForest.predict_proba(featureTest)
    # for i, probas in enumerate(rfPredProb):
    #     predicted_class = np.argmax(probas)
    #     confidence = np.max(probas)
    #     print(f"Confidence = {confidence}")


    # rfPredRAW = encoder.inverse_transform([np.argmax(label) for label in rfPred])
    # realLabelRAW = encoder.inverse_transform([np.argmax(label) for label in labTest])

    # Class whose predictions are only trusted above a minimum probability.
    guardedClass = 1
    # Author's threshold notes (threshold = count at accuracy):
    # 0.7 = 1 at 82%, 0.55 = 11 at 85%, 0.65 = 2 at 83.4%, 0.6 = 6 at 84.5. MAX ACCURACY: 22 at 86%.
    minConfidence = 0.6

    adjustedPred = []
    falseNegHealthy = 0

    # Each row of rfPredProb holds one probability per class. When the top class
    # is the guarded class but its probability is below the threshold, fall back
    # to the most probable other class.
    for i, prob in enumerate(rfPredProb):
        classPred = np.argmax(prob)
        conf = prob[classPred]
        actual = yTest[i]

        if classPred == guardedClass and conf < minConfidence:
            # Mask the guarded class before taking the arg-max so that a tie in
            # probabilities can never select the guarded class again.
            others = np.array(prob, dtype=float)
            others[guardedClass] = -np.inf
            newPred = np.argmax(others)
            adjustedPred.append(newPred)
            # Report the probability of the fallback class, not its index.
            print(f"Low confidence ({conf}). Next Prediction: {newPred} with confidence {prob[newPred]}. Actual: {actual}")
        else:
            adjustedPred.append(classPred)
            print(f'Predicted: {classPred}, Confidence: {conf}, Actual: {actual}')

        # Counts final predictions of the guarded class that are wrong.
        # NOTE(review): the counter is named "false negatives"; presumably class 1
        # is the "healthy" class so these are missed non-healthy cases -- confirm
        # against the label mapping.
        if adjustedPred[i] != actual and adjustedPred[i] == guardedClass:
            print()
            print(f"adjustedPred[i] = {adjustedPred[i]}, np.argmax(labTest[i]) = {actual}")
            falseNegHealthy += 1

    print(falseNegHealthy)
    countReal = pd.Series(yTest).value_counts()
    countPred = pd.Series(adjustedPred).value_counts()

    # Side-by-side per-class counts of true vs. (adjusted) predicted labels.
    compareCount = pd.DataFrame({'Actual: ': countReal, 'Pred: ': countPred})
    print(compareCount)

    accuracy = accuracy_score(yTest, adjustedPred)
    print(f'Random Forest Classifier Accuracy: {accuracy * 100}%')


# ----------------------------------------------------------------------------------------------------------------------------------------------------------
# The triple-quoted string below is a bare module-level string expression, so
# the KNN experiment inside it is never executed. Its indentation suggests it
# was written as the tail of main(); it would have to be moved back inside the
# function (and un-quoted) to run.
# NOTE(review): inside the disabled code, `np.max(newPred)` formats the class
# index rather than that class's probability, and the false-negative check
# tests the actual label for class 1, whereas the random-forest section in
# main() tests the adjusted prediction -- reconcile before re-enabling.
'''

    # hyperparamGrid = {
    #     'n_neighbors' : [2, 4, 8],
    #     'algorithm' : ['ball_tree', 'brute'],
    #     'leaf_size' : [20, 30, 50],
    #     'p' : [1, 2, 3],
    #     #'metric' : ['auto', 'sqrt']
    # }

    knnModel = KNeighborsClassifier(n_neighbors=2, weights='uniform', leaf_size=4)
    
    crossVal = cross_validate(knnModel,featureTrain, np.argmax(labTrain, axis=1), cv=5, return_train_score=True)
    print("Training score: ", crossVal['train_score'])
    print("Validation score: ", crossVal['test_score'])
    

    knnModel.fit(featureTrain, np.argmax(labTrain, axis=1))

    knnPredProb = knnModel.predict_proba(featureTest)

    adjustedPred = []
    falseNegHealthy = 0

    for i, prob in enumerate(knnPredProb): # The confidences are stored as [a, b, c, d], where each one is a probability between 0 and 1. From them, if class = b and conf of this class is smaller than 0.75, predict the next highest
        classPred = np.argmax(prob)
        conf = np.max(prob)
        if classPred == 1 and conf < 0.7:
            sortConf = np.argsort(prob)[::-1]
            newPred = sortConf[1]
            adjustedPred.append(newPred)
            print(f"Low confidence ({conf}). Next Prediction: {newPred} with confidence {np.max(newPred)}. Actual: {np.argmax(labTest[i])}")
        else:
            adjustedPred.append(classPred)
            print(f'Predicted: {classPred}, Confidence: {conf}, Actual: {labTest[i]}')

        if classPred != np.argmax(labTest[i]) and np.argmax(labTest[i]) == 1: # Count how many false negatives there are.
            print()
            print(f"classPred = {classPred}, np.argmax(labTest[i]) = {np.argmax(labTest[i])}")
            falseNegHealthy += 1

    print(falseNegHealthy)
    countReal = pd.Series(np.argmax(labTest, axis=1)).value_counts()
    countPred = pd.Series(adjustedPred).value_counts()

    compareCount = pd.DataFrame({'Actual: ': countReal, 'Pred: ': countPred})
    print(compareCount)

    accuracy = accuracy_score(np.argmax(labTest, axis=1), adjustedPred)
    print(f'KNN Accuracy: {accuracy * 100}%')

'''

# Run the full pipeline when this cell is executed.
main()