<a href="https://colab.research.google.com/github/Git2723122/y4-dissertation/blob/main/cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Manually installs keras-tuner (not built into colab).
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import sys
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, Input, GlobalAveragePooling2D
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import normalize, to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.losses import CategoricalFocalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
from keras_tuner import RandomSearch
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
import cv2
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import gc
#Snapshot of every global name that exists right after the imports.
#The cleanup cell later subtracts this set to find the variables created by the notebook itself.
save = set(globals())




In [None]:
#Mount Google Drive inside the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
#Locations of the image folder and the metadata CSV on the mounted Drive.
image_folder_dir = "/content/drive/MyDrive/HAM10000_images/"
metadata_file_dir = "/content/drive/MyDrive/HAM10000_metadata.csv"

#Load the metadata CSV into a pandas DataFrame so it can be manipulated.
metadata = pd.read_csv(filepath_or_buffer=metadata_file_dir)

#Sanity check: this should print the first 5 metadata rows, otherwise something is wrong.
print(metadata.head(5))


     lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear


In [None]:
#Map each diagnosis code to an integer (its position in text_labels) and store it in a new "label" column.
#The CNN needs numerical targets rather than text.
text_labels = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
labels = dict(zip(text_labels, range(len(text_labels))))
metadata["label"] = metadata["dx"].map(labels)

#Accumulators filled further down in this cell:
#X receives the normalized image data, y the numerical label of each image.
X = []
y = []

def process_image(image_id):
  """Load one image by id and normalize its size and pixel values.

  Args:
    image_id: Identifier from the metadata; the file read is
      image_folder_dir + image_id + ".jpg".

  Returns:
    A float32 array with pixel values divided by 255, resized to 75 px wide
    by 100 px high, or None when OpenCV could not read the file.
  """
  #Image Access
  image_dir = os.path.join(image_folder_dir, image_id + ".jpg")
  image = cv2.imread(image_dir)
  if image is None:
    #Return a single None, not a (None, None) tuple: the caller filters with
    #`is not None`, and a tuple would slip through and end up inside X.
    return None
  #Normalize. cv2.resize takes (width, height), so the result has 100 rows and 75 columns.
  image = cv2.resize(image, (75,100))
  #Scale in float32: the caller stores float32 anyway, and this halves the memory held per image.
  image = image.astype(np.float32)/255.0
  return image

#Run process_image over every image id on 16 worker threads, with a tqdm progress bar.
with ThreadPoolExecutor(max_workers=16) as executor:
  mapped = executor.map(process_image, metadata["image_id"])
  results = list(tqdm(mapped, total=len(metadata)))

#Pair every result that is not None with the label found at the same row position of the metadata.
for i, image in enumerate(results):
  if image is None:
    continue
  X.append(image)
  y.append(metadata["label"].iloc[i])

#Convert the samples and their labels into NumPy arrays for Keras.
X = np.array(X, dtype=np.float32)
y_initial = np.array(y, dtype=np.int32)





100%|██████████| 10015/10015 [08:04<00:00, 20.66it/s]


In [None]:
#Random undersampler that trims only the majority class (nv, label 5) down to 1800 samples.
underSampler = RandomUnderSampler(sampling_strategy={5:1800}, random_state=42)
#Flatten each image to one row for the sampler, resample, then restore the image shape.
X_undersampled, y_undersampled = underSampler.fit_resample(
    X.reshape(len(X), -1),
    y_initial,
)
X_undersampled = X_undersampled.reshape(-1,100,75,3)
#Print the number of samples per class after undersampling.
unique, counts = np.unique(y_undersampled, return_counts=True)
print(dict(zip(unique, counts)))

{0: 327, 1: 514, 2: 1099, 3: 115, 4: 1113, 5: 1800, 6: 142}


In [None]:
#Define the data augmentor and its possible transformations.
generator = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.3,
    height_shift_range=0.3,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range = [0.75,1.25],
    fill_mode='nearest',
    )
#Dedicated seeded RNG so the augmented set is reproducible, like the samplers that use random_state=42.
augment_rng = np.random.default_rng(42)
#Collect the number of samples in each class, keyed by the actual label values
#(not by position, which would silently mislabel if a class were missing).
class_labels, class_counts = np.unique(y_undersampled, return_counts=True)
class_counts_dict = dict(zip(class_labels, class_counts))
#Set the threshold for classes to augment.
target_count = 600
X_augmented = []
y_augmented = []
for label, count in class_counts_dict.items():
  #Only classes with fewer than target_count samples are topped up.
  if count < target_count:
    diff = target_count - count
    #Draw source images from the same undersampled arrays the counts were computed from.
    indices = np.where(y_undersampled == label)[0]
    for i in range(diff):
      image_to_augment_index = augment_rng.choice(indices)
      image_to_augment = X_undersampled[image_to_augment_index]
      image_to_augment = np.expand_dims(image_to_augment, axis=0)
      #flow() yields batches; with batch_size=1 the first item of the first batch is the transformed image.
      #A fresh seed per draw keeps the transforms varied but repeatable. Note that flow() applies the seed
      #to NumPy's global random state, which is why the draws above use a separate generator.
      flow_seed = int(augment_rng.integers(0, 2**31 - 1))
      augmented_image = generator.flow(image_to_augment, batch_size=1, seed=flow_seed)[0][0]
      X_augmented.append(augmented_image)
      y_augmented.append(label)
#Convert the lists into arrays and join them with their preexisting arrays.
if X_augmented:
  X_augmented = np.array(X_augmented, dtype=X_undersampled.dtype)
  y_augmented = np.array(y_augmented, dtype=y_undersampled.dtype)
  X_new = np.concatenate((X_undersampled, X_augmented), axis=0)
  y_new = np.concatenate((y_undersampled, y_augmented), axis=0)
else:
  #No class was below the threshold; concatenating an empty 1-D array with the 4-D image array would fail.
  X_new, y_new = X_undersampled, y_undersampled

new_counts = np.unique(y_new, return_counts=True)[1]
print(new_counts)


[ 600  600 1099  600 1113 1800  600]


In [None]:
#SMOTE oversampler: every class except nv (label 5) is raised to 1800 samples.
overSampler = SMOTE(
    sampling_strategy={label: 1800 for label in (0, 1, 2, 3, 4, 6)},
    random_state=42,
)
#Flatten each image to one row for SMOTE, resample, then restore the image shape.
X_oversampled, y_oversampled = overSampler.fit_resample(
    X_new.reshape(len(X_new), -1),
    y_new,
)
X_oversampled = X_oversampled.reshape(-1, 100, 75, 3)
#Print the number of samples per class after oversampling.
unique, counts = np.unique(y_oversampled, return_counts=True)
print(dict(zip(unique, counts)))

{0: 1800, 1: 1800, 2: 1800, 3: 1800, 4: 1800, 5: 1800, 6: 1800}


In [None]:
#Shuffle the balanced dataset, keeping every image aligned with its label.
X_ready, y_ready = shuffle(X_oversampled, y_oversampled, random_state=42)
#X_ready holds the images and y_ready their labels; no train/test split has been made at this point.
print(f"Samples: {X_ready.shape}, Labels: {y_ready.shape}")

Training Set: (12600, 100, 75, 3), Test Set: (12600,)


In [None]:
#Delete all variables that arent related to libraries or built in functions and not in the whitelist.
#Saves a significant amount of RAM.
allvars = set(globals().keys())
#"save" has to survive: it is read two lines below, so deleting it makes a re-run of this cell fail with NameError.
whitelist = {"X_ready","y_ready","labels","save"}
#Names starting with "_" (e.g. the _i5 input-history entries) are IPython bookkeeping, not notebook data, so they are left alone.
blacklist = {name for name in allvars-save if not name.startswith("_")} - whitelist
print(blacklist)
for var in blacklist:
  del globals()[var]

gc.collect()
print(globals().keys())

{'counts', '_i13', 'count', 'image_folder_dir', 'i', 'executor', '_i10', 'new_counts', 'label', 'metadata', 'process_image', 'y', 'X_augmented', 'labels', 'class_counts', 'y_oversampled', '_i11', '_i9', '_i5', '_i3', 'X', '_i8', '_i6', 'overSampler', 'X_new', 'X_oversampled', 'y_ready', 'text_labels', 'metadata_file_dir', 'save', '_i7', 'y_initial', 'y_new', '_i14', 'image_to_augment', 'unique', 'y_undersampled', 'augmented_image', 'y_augmented', 'drive', 'X_undersampled', '_i12', 'underSampler', 'generator', 'diff', 'image', '_i4', '_i15', 'indices', 'target_count', 'image_to_augment_index', 'X_ready', 'class_counts_dict', 'results', '_i16'}
dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', '_i', '_ii', '_iii', '_i1', '_exit_code', '_i2', 'tf', 'np', 'pd', 'os', 'sys', 'Sequential', 'Model', 'Dense', 'Dropout', 'Activation', 'Flatten', 'Conv2D', 'M

In [None]:
#One hot encode the labels: each label becomes a 0/1 vector with one entry per class.
y = to_categorical(y_ready, num_classes=len(labels))

#Hold out 20% of the samples for testing and train on the remaining 80%, stratified by class.
#NOTE(review): X_ready already contains the augmented and SMOTE-generated samples, so a synthetic sample
#derived from a training image can land in the test set. Confirm whether resampling should happen after the split.
x_train, x_test, y_train, y_test = train_test_split(
    X_ready,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Training Set: {x_train.shape}, Test Set: {x_test.shape}")
print(f"Training Labels: {y_train.shape}, Test Labels: {y_test.shape}")

Training Set: (10080, 100, 75, 3), Test Set: (2520, 100, 75, 3)
Training Labels: (10080, 7), Test Labels: (2520, 7)


In [None]:
#Stop training once val_loss has not improved for 5 epochs, then roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
#Create the model and set all potential hyperparam values incl. layer count
def build_model(hp):
  """Build and compile a CNN for one sampled set of hyperparameters.

  Args:
    hp: keras-tuner HyperParameters object. Filter counts, dropout rates, the number of
      convolutional and dense layers, dense widths and the learning rate are sampled from it.

  Returns:
    A compiled Sequential model that takes (100, 75, 3) inputs and ends in a 7-way softmax.
  """
  model = Sequential()
  model.add(Input(shape=(100,75,3)))
  #Base block: convolution -> 2x2 max pooling -> dropout.
  model.add(Conv2D(filters=hp.Int('filters_base', 16, 128, step=16), kernel_size=(3,3), activation='relu'))
  model.add(MaxPooling2D((2,2)))
  model.add(Dropout(rate=hp.Float('rate_base', 0.1, 0.4, step=0.1)))

  #At most 3 extra blocks. With unpadded 3x3 convolutions and 2x2 pooling the 75-pixel axis shrinks
  #75 -> 73 -> 36 -> 34 -> 17 -> 15 -> 7 -> 5 -> 2 after the base block plus three extra blocks,
  #which is too small for a fourth 3x3 convolution, so sampling 4 only produced unusable trials.
  num_layers = hp.Int('num_layers', 1, 3)
  for i in range(num_layers):
    model.add(Conv2D(filters=hp.Int(f'F_{i}', 32, 256, step=32), kernel_size=(3,3), activation='relu'))
    model.add(MaxPooling2D((2,2)))
    model.add(Dropout(rate=hp.Float(f'R_{i}', 0.1, 0.4, step=0.1)))

  #Classifier head: one or two dense layers, each followed by dropout.
  model.add(Flatten())
  for i in range(hp.Int('num_dense_layers', 1,2)):
    model.add(Dense(units=hp.Int(f'neurons_{i}',64,256,step=64), activation='relu'))
    model.add(Dropout(rate=hp.Float(f'rate_dense_{i}', 0.1, 0.4, step=0.1)))

  model.add(Dense(7, activation='softmax'))

  model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 1e-4, 1e-5])),
              loss=CategoricalFocalCrossentropy(label_smoothing=0.1),
              metrics=['accuracy'])

  return model
#Create the RandomSearcher
tuner = kt.RandomSearch(
    build_model,
    objective=kt.Objective('val_accuracy', direction='max'),
    max_trials=15,
    directory='/content/tuning_results22',
    seed=42
)
#Run the search against the data.
#Validation uses a 20% slice held out from the training data, so the test set is not used
#to pick hyperparameters or to trigger early stopping.
tuner.search(x_train, y_train, epochs=20, validation_split=0.2, verbose=1, callbacks=[early_stopping])

Trial 15 Complete [00h 01m 49s]
val_accuracy: 0.5976190567016602

Best val_accuracy So Far: 0.7869047522544861
Total elapsed time: 00h 46m 32s


In [None]:
#Run model with optimal params
tuned_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters: {tuned_hps.values}")
#Build a fresh, untrained model from the best hyperparameters.
tuned_model = tuner.hypermodel.build(tuned_hps)
#Early stopping (and its best-weight restore) is driven by a 20% validation slice of the training data,
#which keeps x_test untouched for the final evaluation.
tuned_model.fit(x_train, y_train, epochs=50, validation_split=0.2, callbacks=[early_stopping])

Best Hyperparameters: {'filters_base': 64, 'rate_base': 0.1, 'num_layers': 3, 'F_0': 96, 'R_0': 0.30000000000000004, 'num_dense_layers': 1, 'neurons_0': 192, 'rate_dense_0': 0.2, 'learning_rate': 0.001, 'F_1': 160, 'R_1': 0.1, 'F_2': 224, 'R_2': 0.4, 'neurons_1': 128, 'rate_dense_1': 0.30000000000000004, 'F_3': 192, 'R_3': 0.1}
Epoch 1/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 21ms/step - accuracy: 0.2161 - loss: 0.3364 - val_accuracy: 0.3956 - val_loss: 0.2618
Epoch 2/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 17ms/step - accuracy: 0.4048 - loss: 0.2618 - val_accuracy: 0.4599 - val_loss: 0.2368
Epoch 3/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.4826 - loss: 0.2367 - val_accuracy: 0.5175 - val_loss: 0.2170
Epoch 4/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.5140 - loss: 0.2234 - val_accuracy: 0.5306 - val_loss: 0.2139
Epoch 5/50
[1m

<keras.src.callbacks.history.History at 0x7f1e9433b750>

In [None]:
#Show Stats
y_pred = tuned_model.predict(x_test)
#Turn probability vectors and one-hot targets back into class indices.
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

overall_accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f"Overall Accuracy: {overall_accuracy:.4f}")

#Take the class count from the one-hot width instead of hard-coding it.
num_classes = y_test.shape[1]
class_ids = list(range(num_classes))
#Passing labels explicitly keeps target_names aligned even if a class is absent from the test labels.
report = classification_report(y_test_classes, y_pred_classes, labels=class_ids, target_names=[f"Class {i}" for i in class_ids])
print(report)

#Per-class accuracy: the share of each class's test samples that were predicted as that class.
#The recorded output of this cell contains these lines, so they are computed here to keep code and output in sync.
for class_index in class_ids:
  class_mask = y_test_classes == class_index
  if class_mask.any():
    class_accuracy = (y_pred_classes[class_mask] == class_index).mean()
    print(f"Accuracy for Class {class_index}: {class_accuracy:.4f}")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Overall Accuracy: 0.8337
              precision    recall  f1-score   support

     Class 0       0.79      0.72      0.75       360
     Class 1       0.77      0.69      0.73       360
     Class 2       0.87      0.90      0.88       360
     Class 3       0.68      0.79      0.73       360
     Class 4       0.94      0.97      0.96       360
     Class 5       0.90      0.91      0.90       360
     Class 6       0.89      0.85      0.87       360

    accuracy                           0.83      2520
   macro avg       0.84      0.83      0.83      2520
weighted avg       0.84      0.83      0.83      2520

Accuracy for Class 0: 0.7167
Accuracy for Class 1: 0.6944
Accuracy for Class 2: 0.8972
Accuracy for Class 3: 0.7944
Accuracy for Class 4: 0.9750
Accuracy for Class 5: 0.9056
Accuracy for Class 6: 0.8528


Softmax Regression model and SVM. Code not covered in the dissertation.

In [None]:
#Prepare the data without 1 hot encoding
x_trainml, x_testml, y_trainml, y_testml = train_test_split(X_ready, y_ready, test_size = 0.2, stratify=y_ready, random_state=42)
x_train_flat = x_trainml.reshape(x_trainml.shape[0], -1)
x_test_flat = x_testml.reshape(x_testml.shape[0], -1)

scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train_flat)
x_test_scaled = scaler.transform(x_test_flat)


In [None]:
#DOES NOT WORK (SVM, MENTIONED IN DISSERTATION EVALUATION)
#Linear SVM on the standardized pixel features, capped at 1000 iterations.
clf = LinearSVC(max_iter=1000, dual=False)
clf.fit(x_train_scaled, y_trainml)
#Predicted class for each scaled test row.
y_pred_svm = clf.predict(x_test_scaled)

In [None]:
#DOES NOT WORK (^)
predsvm = clf.predict(x_test_scaled)
#Score against y_testml, the labels that come from the same split as x_test_scaled.
#y_test belongs to the separate CNN split and is not guaranteed to be row-aligned with these predictions.
print(classification_report(y_testml, predsvm))


NameError: name 'clf' is not defined

In [None]:
#Run the SR model
num_classes = len(labels)
y_train_onehot = to_categorical(y_trainml, num_classes=num_classes)
#One-hot test targets; not used for fitting below.
y_test_onehot = to_categorical(y_testml, num_classes=num_classes)

#Softmax regression is a single dense softmax layer over the flattened, scaled pixels.
#The input shape is declared with an Input layer (as build_model does) because passing
#input_shape to Dense triggers a Keras warning.
lr = Sequential([
    Input(shape=(x_train_scaled.shape[1],)),
    Dense(num_classes, activation='softmax')
])

lr.compile(optimizer=Adam(learning_rate=0.01),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

#Early stopping watches a 20% validation slice of the training data so the test rows stay unseen during fitting.
lr.fit(x_train_scaled, y_train_onehot, epochs=50, validation_split=0.2, callbacks=[early_stopping])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.3338 - loss: 45.3985 - val_accuracy: 0.3889 - val_loss: 35.8682
Epoch 2/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.4515 - loss: 29.7431 - val_accuracy: 0.4659 - val_loss: 30.9939
Epoch 3/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.4934 - loss: 30.4703 - val_accuracy: 0.4849 - val_loss: 32.1332
Epoch 4/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5455 - loss: 24.9936 - val_accuracy: 0.5024 - val_loss: 27.0838
Epoch 5/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5676 - loss: 23.8656 - val_accuracy: 0.5202 - val_loss: 28.0364
Epoch 6/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5819 - loss: 26.0136 - val_accuracy: 0.4909 - val_loss: 42.4280
Epoch 7/50
[1m

<keras.src.callbacks.history.History at 0x7f1e3c67c750>

In [None]:
#Class probabilities from the SR model for every scaled test row.
predlr = lr.predict(x_test_scaled)
#The predicted class is the index with the highest probability.
predictions = predlr.argmax(axis=1)
#Show SR results.
print(classification_report(y_testml, predictions))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.71      0.64      0.67       360
           1       0.63      0.72      0.67       360
           2       0.61      0.47      0.53       360
           3       0.65      0.56      0.60       360
           4       0.45      0.58      0.51       360
           5       0.44      0.37      0.40       360
           6       0.62      0.74      0.68       360

    accuracy                           0.58      2520
   macro avg       0.59      0.58      0.58      2520
weighted avg       0.59      0.58      0.58      2520

