In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

#warnings.filterwarnings("ignore")
# The notebook was written against TensorFlow 2.9.0, but the output recorded
# below this cell shows 2.10.1 -- TODO confirm which version is intended.
print(tf.__version__)

2.10.1


In [2]:
# Single seed shared by every stochastic step in the notebook.
rand_seed = 42

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and the shuffling done inside `model.fit` are repeatable,
# not only the pandas shuffle that receives `rand_seed` explicitly.
tf.keras.utils.set_random_seed(rand_seed)

In [None]:
# Root folder of the dataset; logs and saved models are written beneath it.
dir_path = r"C:\Users\Admin\Desktop\Data\data_x"

# Pass each path component separately so the join uses the platform separator
# instead of a backslash embedded in the component string.
results_dir = os.path.join(dir_path, 'logs', 'EfficientNet-B5')
models_dir = os.path.join(dir_path, 'models', 'EfficientNet-B5')

# exist_ok=True creates any missing folder and does nothing when the folder is
# already there, which removes the check-then-create race of a separate
# isdir() test and makes the cell safe to re-run.
os.makedirs(results_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

In [None]:
# Location of the dataset. Every path is derived from `dir_path` (defined in
# the previous cell) so that relocating the data needs a single edit.
TRAIN_PATH = os.path.join(dir_path, 'train')
TRAIN_LABEL_PATH = os.path.join(dir_path, 'train_classification.csv')

VAL_PATH = os.path.join(dir_path, 'validation')
VAL_LABEL_PATH = os.path.join(dir_path, 'validation_classification.csv')

# Images are resized to IMG_DIM when loaded; the model input adds 3 channels.
IMG_DIM = (224, 224)
INPUT_SHAPE = IMG_DIM + (3,)

BATCH_SIZE = 32
EPOCH = 100
x_axis_inc = 1 # for plotting the training acc and loss

In [None]:
import pandas as pd

# Read the training label table from TRAIN_LABEL_PATH and preview its first
# rows. Later cells read the 'ID' and 'c_erosion' columns from this frame.
train_original_df = pd.read_csv(TRAIN_LABEL_PATH)
train_original_df.head()

In [None]:
def expand_dataframe(df, n_slices=224):
    """Repeat every row of ``df`` once per slice and tag each copy with its slice.

    For each input row, ``n_slices`` output rows are produced (slice indices
    ``0 .. n_slices - 1``). In every copy:

    * ``ID`` becomes ``"<original ID>_x_<slice index, zero-padded to 3 digits>"``;
    * ``one_hot_vector`` is a list of ``n_slices`` floats that is 1.0 at the
      slice index and 0.0 elsewhere.

    All other columns are carried over unchanged, and every copy keeps the
    index label of the row it came from.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ID`` column. It is not modified.
    n_slices : int, default 224
        Number of copies per input row (non-negative).

    Returns
    -------
    pandas.DataFrame
        ``len(df) * n_slices`` rows with the columns of ``df`` followed by
        ``one_hot_vector``. An empty ``df`` yields an empty frame that still
        has those columns.
    """
    n_rows = len(df)

    # Positional row numbers, each repeated n_slices times. Selecting by
    # position (not by label) keeps this correct for non-unique indexes.
    row_positions = np.repeat(np.arange(n_rows), n_slices)
    # Slice index of every output row: 0..n_slices-1, cycled once per input row.
    slice_indices = np.tile(np.arange(n_slices), n_rows).tolist()

    expanded = df.iloc[row_positions].copy()

    expanded['ID'] = [
        f"{original_id}_x_{slice_idx:03d}"
        for original_id, slice_idx in zip(expanded['ID'], slice_indices)
    ]

    # Row i of the identity matrix is the one-hot vector for slice i;
    # .tolist() gives each output row its own independent list.
    identity = np.eye(n_slices)
    expanded['one_hot_vector'] = [identity[slice_idx].tolist() for slice_idx in slice_indices]

    return expanded

# Shuffle the rows of the label table (one row per original ID) with the shared
# seed. The result is written back to the same name, so re-running this cell
# shuffles an already-shuffled frame.
train_original_df = (
    train_original_df
    .sample(frac=1, random_state=rand_seed)
    .reset_index(drop=True)
)

# Keep only the identifier and the target, then expand to one row per slice.
label_columns = ['ID', 'c_erosion']
train_c_erosion_label = train_original_df[label_columns]
train_df = expand_dataframe(train_c_erosion_label)

# Shuffling of the expanded (per-slice) rows is currently disabled:
#train_df = train_df.sample(frac=1, random_state=rand_seed).reset_index(drop=True)

train_df.head()

In [None]:
# slice_number = '160'
# train_df = train_df[train_df['ID'].str.endswith(slice_number)]
# train_df.head()

In [None]:
# `data_labels` is a second name for `train_df` (no copy is made), so the
# column added below is visible through both names.
data_labels = train_df
target_labels = data_labels['c_erosion']

# Image file for each row: <TRAIN_PATH>/<ID>.jpg
data_labels['image_path'] = data_labels['ID'].map(
    lambda sample_id: os.path.join(TRAIN_PATH, str(sample_id)) + '.jpg'
)
data_labels.head()

In [None]:
# (rows, columns) of the expanded training table.
print(data_labels.shape)

In [None]:
import pandas as pd
import os

def check_valid_files(df, column_name='image_path'):
    """Mark which paths in ``df[column_name]`` exist on disk and print a summary.

    A boolean ``is_valid_file`` column is written onto ``df`` itself (the
    frame is modified in place). The total / valid / invalid counts are
    printed, followed by every path that is not an existing file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the paths to check.
    column_name : str, default 'image_path'
        Name of the column that contains the paths.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    exists_flags = df[column_name].apply(os.path.isfile)
    df['is_valid_file'] = exists_flags

    n_total = len(df)
    n_valid = df['is_valid_file'].sum()
    n_invalid = n_total - n_valid

    print(f"Total files: {n_total}")
    print(f"Valid files: {n_valid}")
    print(f"Invalid files: {n_invalid}")

    # List the offending paths only when there is something to report.
    if n_invalid > 0:
        print("\nInvalid files:")
        missing_mask = ~df['is_valid_file']
        for missing_path in df.loc[missing_mask, column_name]:
            print(missing_path)

    return df

# Verify that every training image path exists. check_valid_files returns the
# frame it was given, so `check_valid_file_df` is the same object as
# `data_labels`, now carrying an extra 'is_valid_file' column.
check_valid_file_df = check_valid_files(data_labels)

# You can access the results in the DataFrame
check_valid_file_df.head(10)

In [None]:
# Read the validation label table and keep only the identifier and the target.
val_original_df = pd.read_csv(VAL_LABEL_PATH)
val_c_erosion_label = val_original_df[['ID','c_erosion']]
# Expand to one row per slice (same expansion as for the training table).
val_df = expand_dataframe(val_c_erosion_label)

# Shuffling of the validation rows is currently disabled:
#val_df = val_df.sample(frac=1, random_state=rand_seed).reset_index(drop=True)

val_df.head()

In [None]:
# # Filter for only the desired slice (ID ending with 'slice_number')
# val_df = val_df[val_df['ID'].str.endswith(slice_number)]

# val_df.head()

In [None]:
# Take an explicit copy of the selected columns. Without .copy(), pandas treats
# `val_labels` as derived from `val_df`, and adding a column to it (here and
# later in check_valid_files) raises SettingWithCopyWarning.
val_labels = val_df[['ID', 'c_erosion', 'one_hot_vector']].copy()
target_val_labels = val_labels['c_erosion']

# Image file for each row: <VAL_PATH>/<ID>.jpg
val_labels['image_path'] = val_labels['ID'].map(
    lambda sample_id: os.path.join(VAL_PATH, str(sample_id)) + '.jpg'
)
val_labels.head()

In [None]:
# (rows, columns) of the expanded validation table.
print(val_labels.shape)

In [None]:
# Verify that every validation image path exists. This rebinds
# `check_valid_file_df`, which previously referred to the training table.
check_valid_file_df = check_valid_files(val_labels)
check_valid_file_df.head()

In [None]:
# How many training patients to load (max = 364; the author's machine cannot
# hold more than 100 in memory).
number_of_patient = 100

# Each patient contributes 224 rows, so this is the number of training rows.
train_data_size = number_of_patient * 224
# Number of validation rows to load (max = 12096).
validation_data_size = 12096

print(f"{train_data_size} {validation_data_size}")

In [None]:
# Load the first `train_data_size` training images into one float32 array.
# The array is allocated once and filled image by image. Building a Python
# list, converting it with np.array and then calling .astype('float32') would
# hold several full-size copies of the dataset in memory at the same time.
train_image_paths = data_labels['image_path'].iloc[:train_data_size].tolist()
train_data = np.empty((len(train_image_paths), *IMG_DIM, 3), dtype='float32')
for img_idx, img_path in enumerate(train_image_paths):
    # load_img resizes to IMG_DIM; img_to_array yields a (height, width, 3) array.
    train_data[img_idx] = img_to_array(load_img(img_path, target_size=IMG_DIM))

In [None]:
# Stack the per-row one-hot slice vectors into a (rows, vector length) float32
# array, aligned row-for-row with the loaded training images.
train_one_hot_lists = data_labels['one_hot_vector'].iloc[:train_data_size].tolist()
layer_number_data = np.array(train_one_hot_lists, dtype='float32')

In [None]:
# Load the first `validation_data_size` validation images into one float32
# array. The array is allocated once and filled image by image, which avoids
# the extra full-size copies made by np.array(list).astype('float32').
val_image_paths = val_labels['image_path'].iloc[:validation_data_size].tolist()
val_data = np.empty((len(val_image_paths), *IMG_DIM, 3), dtype='float32')
for img_idx, img_path in enumerate(val_image_paths):
    # load_img resizes to IMG_DIM; img_to_array yields a (height, width, 3) array.
    val_data[img_idx] = img_to_array(load_img(img_path, target_size=IMG_DIM))

In [None]:
# Stack the per-row one-hot slice vectors into a (rows, vector length) float32
# array, aligned row-for-row with the loaded validation images.
val_one_hot_lists = val_labels['one_hot_vector'].iloc[:validation_data_size].tolist()
val_layer_number_data = np.array(val_one_hot_lists, dtype='float32')

In [None]:
# Shapes of the loaded image arrays (first axis = number of images).
print('Training Dataset Size:', train_data.shape)
print('Validation Dataset Size:', val_data.shape)

## Prepare for Transfer Learning

In [None]:
# Input preprocessing function that matches the EfficientNet family.
# NOTE(review): the Keras documentation describes this function as a
# pass-through for EfficientNet (rescaling happens inside the model) --
# confirm for the installed TensorFlow version.
prep_in = tf.keras.applications.efficientnet.preprocess_input

In [None]:
# The arrays built above are already float32, and they hold whole-number pixel
# values (images are decoded as 8-bit) or 0/1 flags. Casting them to an integer
# dtype therefore changes no value but allocates a second full-size copy of
# every array. Keep the `*_new` names, which the next cell reads, as plain
# references to the existing arrays instead.
x_train_new = train_data
x_val_new = val_data
layer_number_data_new = layer_number_data
val_layer_number_data_new = val_layer_number_data

In [None]:
# Image tensors go through the network's input preprocessing.
x_train_in = prep_in(x_train_new)
x_val_in = prep_in(x_val_new)

# The one-hot slice-position vectors are not images, so they must not be sent
# through an image preprocessing function; they are only made float32 for the
# model's second input. np.asarray does not copy when the dtype already matches.
layer_number_data = np.asarray(layer_number_data_new, dtype='float32')
val_layer_number_data = np.asarray(val_layer_number_data_new, dtype='float32')

In [None]:
# Targets for the rows that were actually loaded, taken by position so they
# line up one-to-one with the image arrays.
train_labels_enc = target_labels.iloc[:train_data_size].to_numpy()
val_labels_enc = target_val_labels.iloc[:validation_data_size].to_numpy()

In [None]:
# Load EfficientNetB5 with ImageNet weights and without its classification head
# (include_top=False) so it can serve as the backbone for transfer learning.
base_model = tf.keras.applications.EfficientNetB5(input_shape=INPUT_SHAPE, include_top=False, weights='imagenet')

If the cell above gets stuck, try reinstalling h5py:

conda uninstall h5py
pip uninstall h5py 

and then

conda install h5py==3.10.0

In [None]:
# Number of layers in the EfficientNetB5 backbone.
print('Number of layers in the base model: ', len(base_model.layers))

In [None]:
# Show every backbone layer with its name and trainable flag, without
# truncating long cell contents.
pd.set_option('max_colwidth', None)
layer_table_columns = ['Layer Type', 'Layer Name', 'Layer Trainable']
layers = [(lyr, lyr.name, lyr.trainable) for lyr in base_model.layers]
pd.DataFrame(layers, columns=layer_table_columns)

## Adding GlobalAvgPool > Concatenate (slice one-hot) > Dense(1024) > Dense(512) > 1-unit sigmoid output on top of EfficientNet

In [None]:
# Freeze the whole backbone so that only the layers added on top are trained.
base_model.trainable = False

# Re-list the backbone layers (skipping the first 10) to inspect their
# trainable flags after freezing.
pd.set_option('max_colwidth', None)
layers = [(layer, layer.name, layer.trainable) for layer in base_model.layers[10:]]
pd.DataFrame(layers, columns=['Layer Type', 'Layer Name', 'Layer Trainable'])

In [None]:
# Base name shared by the training figure and the CSV training log.
fig_train = 'EfficientNetB5_A_FE_train_224'
log_file = os.path.join(results_dir, fig_train + '.csv')
# File name for the saved model.
model_path = os.path.join(models_dir, 'EfficientNetB5_A_FE_224.h5')

In [None]:
# Create a new model on top

layer_number_input = tf.keras.Input(shape=(224,))
inputs = tf.keras.Input(shape=INPUT_SHAPE)
# We make sure that the base_model is running in inference mode here,
# by passing 'training=False'. This is important for fine-tuning
x = base_model(inputs, training=False)

x = GlobalAveragePooling2D()(x)
#x = Dense(1024, activation='sigmoid')(x)
x = tf.keras.layers.Concatenate(axis=1)([x, layer_number_input])

x = Dense(1024, activation='relu')(x)
x = Dense(512, activation='relu')(x)

# A Dense classifier with a single unit (binary classification)
outputs = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[inputs,layer_number_input], outputs=outputs)

In [None]:
# Show every layer of the assembled model with its name and trainable flag.
pd.set_option('max_colwidth', None)
layer_table_columns = ['Layer Type', 'Layer Name', 'Layer Trainable']
layers = [(lyr, lyr.name, lyr.trainable) for lyr in model.layers]
pd.DataFrame(layers, columns=layer_table_columns)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Append the metrics of every epoch to the CSV log file.
csv_logger = CSVLogger(log_file)

# Save the full model (not only the weights) at the end of every epoch,
# whether or not it improved; the epoch number is part of the file name.
epoch_checkpoint = ModelCheckpoint(
    filepath=os.path.join(models_dir, "Eff-epoch-{epoch:02d}.h5"),
    save_weights_only=False,
    save_best_only=False,
    save_freq='epoch',
    verbose=1,
)

callbacks = [csv_logger, epoch_checkpoint]

In [None]:
# Binary cross-entropy on probabilities: the model's output layer applies a
# sigmoid, so from_logits must stay False.
model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=False), # set from_logits=True only if the output sigmoid is removed
              optimizer= keras.optimizers.RMSprop(learning_rate=1e-3),
              metrics = [keras.metrics.BinaryAccuracy()])

In [None]:
# Train the head on the frozen backbone. Both inputs (image tensor and
# slice-position one-hot) are passed in the order declared in `Model(inputs=...)`.
# batch_size is passed explicitly so that the BATCH_SIZE constant from the
# configuration cell actually controls training; without it `fit` silently
# falls back to its own default.
history = model.fit(
    x=[x_train_in, layer_number_data],
    y=train_labels_enc,
    validation_data=([x_val_in, val_layer_number_data], val_labels_enc),
    batch_size=BATCH_SIZE,
    epochs=EPOCH,
    verbose=1,
    callbacks=callbacks,
)