In [None]:
import time
import os
import cv2
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import svm
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Training on Chest X-ray14

In [None]:
#The file_list will contain the files that will be used for training on the Chest X-ray 14 dataset
#The test_list will contain the files that will be used for testing
file_list = []
test_list = []
#Path to the npz files that contain the feature representations
directory =  R'D:\SSD downloads\Processed Chest X-ray 14\\'
#File numbers (110-120) reserved for testing, kept as strings for the substring check below
test_numbers = [str(number) for number in range(110, 121)]


for filename in os.listdir(directory):
    path = os.path.join(directory, filename)
    #Skip anything that is not a regular file (e.g. sub-directories)
    if not os.path.isfile(path):
        continue
    #Decide once per file: a name containing any reserved number goes to test_list, every
    #other file goes to file_list. Making this decision inside a loop over the numbers would
    #append the same path once per number and would also put test files into file_list.
    #NOTE(review): this is a substring match, so a name such as "1100" also matches "110" -
    #confirm the file naming scheme makes that impossible.
    if any(number in filename for number in test_numbers):
        test_list.append(path)
    else:
        file_list.append(path)

In [None]:
#The classifiers list is meant to hold the 14 different classifiers, one per class
#(it is not populated in this cell)
classifiers = []
#counter holds the number of npz files the model has been trained on so far
counter = 0
#Training stops after this many npz files
max_train_files = 20

#Initialization of the Fully connected layer
#Each of the 14 outputs is scored on its own with ROC-AUC in the testing cells, so every
#output is an independent sigmoid trained with binary cross-entropy. A softmax forces the 14
#outputs to compete and categorical cross-entropy expects exactly one positive class per sample.
#NOTE(review): assumes the stored labels are multi-hot vectors of length 14 and that every
#feature array is already flattened to 14 * 14 * 2048 values - confirm against the npz files.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(14, activation="sigmoid", input_shape=(14 * 14 * 2048,)))
model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])



#Loop over the npz files and load them. Entries whose key contains "Label" are collected as
#targets, all other entries as features. Once a whole npz file has been read, its contents
#are used to train the fully connected layer for 30 epochs before the next file is loaded.
for counter, file in enumerate(file_list[:max_train_files], start=1):
    #These variables are reset for every new file that is loaded
    train_image_data = []
    train_label = []
    #The with-block closes the npz archive as soon as its arrays have been read
    with np.load(file) as loaded_array:
        #Checking what key the current value from the npz file has
        for key in loaded_array:
            if "Label" in key:
                train_label.append(loaded_array[key])
            else:
                train_image_data.append(loaded_array[key])
    #Instantiate X_train and y_train, converting them both to tensors
    X_train = tf.convert_to_tensor(np.asarray(train_image_data), dtype=tf.float32)
    y_train = tf.convert_to_tensor(np.asarray(train_label), dtype=tf.float32)
    #Train model on the current data loaded
    model.fit(X_train, y_train, epochs=30, batch_size=64)
    print(counter)

### Testing on Chest X-ray14

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


#counter holds the number of test files evaluated so far
counter = 0
#Testing stops after this many npz files
max_test_files = 2
#Column indices of [Effusion, Atelectasis, Consolidation, Cardiomegaly, Edema] in y_test and y_pred
target_labels = [3, 8, 7, 12, 2]
#Every per-label AUC is collected here so the average always divides by the number of
#scores that were actually computed
auc_scores = []
#AUC score will be stored in roc_auc variable
roc_auc = 0


for file in test_list[:max_test_files]:
    #These variables are reset for every new file that is loaded
    X_test = []
    y_test = []
    #The with-block closes the npz archive as soon as its arrays have been read
    with np.load(file) as loaded_array:
        #Checking what key the current value from the npz file has
        for key in loaded_array:
            if "Label" in key:
                y_test.append(loaded_array[key])
            else:
                X_test.append(loaded_array[key])
    #y_test stays a NumPy array because it is only consumed by scikit-learn
    y_test = np.asarray(y_test, dtype=np.int32)
    X_test = tf.convert_to_tensor(np.asarray(X_test), dtype=tf.float32)
    y_pred = model.predict(X_test)
    #Calculate the AUC-ROC for each target label that exists in this file
    for i in range(y_test.shape[1]):
        if i in target_labels:
            label_auc = roc_auc_score(y_test[:, i], y_pred[:, i])
            auc_scores.append(label_auc)
            print(f"AUC for label {i}: {label_auc}")
    counter += 1

#Average over every score that was computed, whatever the number of files evaluated;
#roc_auc stays 0 when nothing was evaluated
if auc_scores:
    roc_auc = sum(auc_scores) / len(auc_scores)

In [None]:
#Show the AUC value that the Chest X-ray14 (in-distribution) testing cell left in roc_auc
print(f"{roc_auc}")

### Testing on CheXpert

In [None]:
#Collect up to 5 CheXpert npz files into test_list for out of distribution evaluation
test_list = []
directory = R'D:\Processed CheXpert\\'

for entry in os.listdir(directory):
    candidate = os.path.join(directory, entry)
    #Entries that are not regular files are ignored
    if not os.path.isfile(candidate):
        continue
    test_list.append(candidate)
    #Stop as soon as 5 files have been collected
    if len(test_list) == 5:
        break

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


#counter holds the number of test files evaluated so far
counter = 0
#Testing stops after this many npz files
max_test_files = 2
#(CheXpert label column, model output column) for every finding shared by both datasets.
#The model was trained on the Chest X-ray14 label layout, so its output columns follow that
#layout and not the CheXpert one; each CheXpert label must be scored against the model
#output for the same finding.
#NOTE(review): the pairs are derived from the index comments of the two testing cells
#(Chest X-ray14 [Effusion, Atelectasis, Consolidation, Cardiomegaly, Edema] = [3,8,7,12,2],
#CheXpert [Edema, Effusion, Atelectasis, Consolidation, Cardiomegaly] = [0,6,7,9,10]) -
#confirm them against the label order used when the npz files were created.
label_pairs = [
    (0, 2),    #Edema
    (6, 3),    #Effusion
    (7, 8),    #Atelectasis
    (9, 7),    #Consolidation
    (10, 12),  #Cardiomegaly
]
#Every per-label AUC is collected here so the average always divides by the number of
#scores that were actually computed
auc_scores = []
#AUC score will be stored in roc_auc variable
roc_auc = 0


for file in test_list[:max_test_files]:
    #These variables are reset for every new file that is loaded
    X_test = []
    y_test = []
    #The with-block closes the npz archive as soon as its arrays have been read
    with np.load(file) as loaded_array:
        #Checking what key the current value from the npz file has
        for key in loaded_array:
            if "Label" in key:
                y_test.append(loaded_array[key])
            else:
                X_test.append(loaded_array[key])
    #y_test stays a NumPy array because it is only consumed by scikit-learn
    y_test = np.asarray(y_test, dtype=np.int32)
    X_test = tf.convert_to_tensor(np.asarray(X_test), dtype=tf.float32)
    y_pred = model.predict(X_test)
    #Calculate the AUC-ROC for each shared finding
    for i, model_column in label_pairs:
        #A label column that does not exist in this file is skipped
        if i >= y_test.shape[1]:
            continue
        label_auc = roc_auc_score(y_test[:, i], y_pred[:, model_column])
        auc_scores.append(label_auc)
        print(f"AUC for label {i}: {label_auc}")
    counter += 1

#Average over every score that was computed, whatever the number of files evaluated;
#roc_auc stays 0 when nothing was evaluated
if auc_scores:
    roc_auc = sum(auc_scores) / len(auc_scores)

In [None]:
#Show the AUC value that the CheXpert (out of distribution) testing cell left in roc_auc
print(f"{roc_auc}")