In [1]:
import tensorflow as tf
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# Iterating an empty device list is a no-op, so no separate emptiness check is needed.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the failure and continue rather than aborting the notebook.
    print(e)

In [2]:
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Memory management: let TensorFlow grow GPU memory on demand instead of
# reserving all VRAM up front.
# The setting is applied to EVERY visible GPU (TensorFlow rejects mixed
# memory-growth settings across devices), and RuntimeError is caught because
# the option can no longer be changed once the GPUs have been initialized.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("GPU Memory growth enabled. Ready for training.")
    except RuntimeError as e:
        print(f"Could not enable GPU memory growth: {e}")
else:
    # Do not claim memory growth was enabled when there is no GPU to configure.
    print("No GPU detected. Training will run on CPU.")

GPU Memory growth enabled. Ready for training.


In [3]:
# --- Label set and network definition ---

# Notebook kernels do not share state, so the label list is declared again here.
labels = [
    'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
    'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass',
    'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax',
]

# 1. DenseNet121 backbone with ImageNet weights and no classification top
base_model = DenseNet121(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

# 2. Custom head: global average pooling -> dropout -> one sigmoid unit per label
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.2)(x)
predictions = Dense(units=len(labels), activation='sigmoid')(x)

# 3. Join backbone and head into a single trainable model
model = Model(inputs=base_model.input, outputs=predictions)

# 4. Multi-label setup: binary cross-entropy loss, per-label AUC as the metric
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=[tf.keras.metrics.AUC(name='auc', multi_label=True)],
)

print(f"Model successfully built for {len(labels)} classes.")

Model successfully built for 14 classes.


In [4]:
import os # Import OS to handle folder creation

# 1. Make sure the output folder for saved weights exists
if not os.path.exists('models'):
    os.makedirs('models')
    print("Created 'models' directory.")

# 2-4. Build the three training callbacks in the order Keras will receive them
callbacks_list = [
    # Save the model whenever val_loss reaches a new best value
    ModelCheckpoint(
        filepath='models/best_model.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    ),
    # Stop after 5 epochs without val_loss improvement and restore the best weights
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    ),
    # Multiply the learning rate by 0.2 after 2 stagnant epochs, never going below 1e-7
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=2,
        verbose=1,
        min_lr=1e-7,
    ),
]

# Keep the individual callbacks reachable under their own names as well
checkpoint, early_stop, reduce_lr = callbacks_list
print("Callbacks defined and ready.")

Callbacks defined and ready.


In [7]:
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import GroupShuffleSplit

# 1. Load data
df = pd.read_csv('../data/cleaned_data.csv')
labels = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 
          'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 
          'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# 2. Split Data (Patient-Aware)
# First split: 80% of patients for training, the remaining 20% held back.
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, temp_idx = next(gss.split(df, groups=df['Patient ID']))
train_df = df.iloc[train_idx]
temp_df = df.iloc[temp_idx]

# Second split: divide the held-back patients evenly into validation and test.
gss_val = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=42)
val_idx, test_idx = next(gss_val.split(temp_df, groups=temp_df['Patient ID']))
valid_df = temp_df.iloc[val_idx]
# Keep the test split instead of discarding it, so it is available for final evaluation.
test_df = temp_df.iloc[test_idx]

# Safety net: no patient may appear in more than one split (would leak information).
train_patients = set(train_df['Patient ID'])
valid_patients = set(valid_df['Patient ID'])
test_patients = set(test_df['Patient ID'])
assert train_patients.isdisjoint(valid_patients), "patient leakage: train/valid"
assert train_patients.isdisjoint(test_patients), "patient leakage: train/test"
assert valid_patients.isdisjoint(test_patients), "patient leakage: valid/test"
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

# 3. Create Generators (small batch size chosen for a 4GB VRAM budget)
train_idg = ImageDataGenerator(rescale=1./255, horizontal_flip=True, rotation_range=15, zoom_range=0.1)
test_idg = ImageDataGenerator(rescale=1./255)

# Seeded so that shuffling and random augmentation are reproducible across runs.
train_gen = train_idg.flow_from_dataframe(dataframe=train_df, x_col='path', y_col=labels,
                                          class_mode='raw', target_size=(224, 224), batch_size=4,
                                          seed=42)

# shuffle=False keeps batches in valid_df row order, so predictions made with this
# generator line up with the dataframe's labels.
valid_gen = test_idg.flow_from_dataframe(dataframe=valid_df, x_col='path', y_col=labels,
                                         class_mode='raw', target_size=(224, 224), batch_size=4,
                                         shuffle=False)

print("Generators Re-initialized in Notebook 03!")

Found 89859 validated image filenames.
Found 10987 validated image filenames.
Generators Re-initialized in Notebook 03!


In [8]:
# Number of batches per epoch. len() on a Keras data iterator is
# ceil(n / batch_size), so the final partial batch is included. Floor division
# (n // batch_size) would silently skip the trailing samples of each split.
STEP_SIZE_TRAIN = len(train_gen)
STEP_SIZE_VALID = len(valid_gen)

print("Starting training... Go grab a coffee, this will take time!")

# Train for at most 20 epochs. The callbacks (checkpointing, early stopping,
# learning-rate reduction) decide when to save, slow down, or stop.
history = model.fit(
    train_gen,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_gen,
    validation_steps=STEP_SIZE_VALID,
    epochs=20, # upper bound only; EarlyStopping normally ends training sooner
    callbacks=callbacks_list
)

Starting training... Go grab a coffee, this will take time!
Epoch 1/20
Epoch 1: val_loss improved from inf to 0.18059, saving model to models\best_model.keras
Epoch 2/20
Epoch 2: val_loss improved from 0.18059 to 0.16052, saving model to models\best_model.keras
Epoch 3/20
Epoch 3: val_loss did not improve from 0.16052
Epoch 4/20
Epoch 4: val_loss did not improve from 0.16052

Epoch 4: ReduceLROnPlateau reducing learning rate to 1.9999999494757503e-05.
Epoch 5/20
Epoch 5: val_loss improved from 0.16052 to 0.15659, saving model to models\best_model.keras
Epoch 6/20
Epoch 6: val_loss did not improve from 0.15659
Epoch 7/20
Epoch 7: val_loss improved from 0.15659 to 0.15263, saving model to models\best_model.keras
Epoch 8/20
Epoch 8: val_loss did not improve from 0.15263
Epoch 9/20
Epoch 9: val_loss did not improve from 0.15263

Epoch 9: ReduceLROnPlateau reducing learning rate to 3.999999898951501e-06.
Epoch 10/20
Epoch 10: val_loss improved from 0.15263 to 0.15046, saving model to models

Technical Note on Training Termination (Early Stopping Analysis):

The training process was intentionally terminated at Epoch 18 despite a maximum limit of 20 epochs. This behavior is attributed to the EarlyStopping callback, which monitors val_loss with a patience setting of 5.

- Optimal Convergence: The model reached its peak performance (minimum validation loss) at Epoch 13.
- The 5-Epoch Rule: From Epoch 14 to Epoch 18, the model failed to achieve a lower val_loss than the record set in Epoch 13.
- Prevention of Overfitting: Upon reaching the threshold of 5 consecutive non-improving epochs, the callback stopped training to prevent overfitting and "weight drifting."
- Result: The final saved weights (best_model.keras) correspond to the state at Epoch 13, ensuring the most generalizable version of the network is preserved.

In [11]:
import json

# The recorded metric values are not plain Python floats, so each per-epoch
# series (e.g. 'auc', 'loss') is mapped through float() before serialising.
history_dict = {
    metric: list(map(float, series))
    for metric, series in history.history.items()
}

# Write the converted history next to the saved model weights.
with open('models/train_history.json', 'w') as f:
    f.write(json.dumps(history_dict))

print("History saved successfully as train_history.json!")

History saved successfully as train_history.json!
