In [None]:
from google.colab import files

print("Upload kaggle.json")
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# downloads to /content
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia -p /content


Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia
License(s): other
Downloading chest-xray-pneumonia.zip to /content
 98% 2.25G/2.29G [00:19<00:00, 144MB/s]
100% 2.29G/2.29G [00:20<00:00, 122MB/s]


In [10]:
!unzip -q /content/chest-xray-pneumonia.zip -d /content/chest_xray
!ls -la /content/chest_xray
!find /content/chest_xray -maxdepth 2 -type d -print


total 12
drwxr-xr-x 3 root root 4096 Dec  2 17:02 .
drwxr-xr-x 1 root root 4096 Dec  2 17:02 ..
drwxr-xr-x 7 root root 4096 Dec  2 17:03 chest_xray
/content/chest_xray
/content/chest_xray/chest_xray
/content/chest_xray/chest_xray/__MACOSX
/content/chest_xray/chest_xray/val
/content/chest_xray/chest_xray/test
/content/chest_xray/chest_xray/chest_xray
/content/chest_xray/chest_xray/train


##### Imports + hyperparameters + helpful checks

In [17]:
import os, math, numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2, VGG16
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix

# Hyperparams
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 16
INITIAL_LR = 1e-3
FINE_TUNE_LR = 1e-5

# Dataset paths
TRAIN_DIR = "/content/chest_xray/chest_xray/train"
VAL_DIR   = "/content/chest_xray/chest_xray/val"
TEST_DIR  = "/content/chest_xray/chest_xray/test"

# quick file counts
def count_files(d):
    total = 0
    for root, _, files in os.walk(d):
        total += len([f for f in files if f.lower().endswith(('.png','.jpg','.jpeg'))])
    return total

print("train images (original):", count_files(TRAIN_DIR))
print("val folder images (original):", count_files(VAL_DIR))
print("test images:", count_files(TEST_DIR))


train images (original): 5216
val folder images (original): 16
test images: 624


##### Build data generators (use validation_split from TRAIN_DIR so validation isn't tiny)

In [18]:
# Create train generator with validation_split
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    width_shift_range=0.08,
    height_shift_range=0.08,
    shear_range=0.08,
    zoom_range=0.08,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.10   # 10% of TRAIN_DIR for validation
)

train_gen = train_datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='training',
    shuffle=True
)

val_gen = train_datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='validation',
    shuffle=False
)

test_datagen = ImageDataGenerator(rescale=1./255)
test_gen = test_datagen.flow_from_directory(
    TEST_DIR,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False
)

steps_per_epoch = math.ceil(train_gen.samples / BATCH_SIZE)
validation_steps = math.ceil(val_gen.samples / BATCH_SIZE)
test_steps = math.ceil(test_gen.samples / BATCH_SIZE)

print("train samples:", train_gen.samples, "val samples:", val_gen.samples, "test samples:", test_gen.samples)
print("steps_per_epoch:", steps_per_epoch, "validation_steps:", validation_steps, "test_steps:", test_steps)

Found 4695 images belonging to 2 classes.
Found 521 images belonging to 2 classes.
Found 624 images belonging to 2 classes.
train samples: 4695 val samples: 521 test samples: 624
steps_per_epoch: 294 validation_steps: 33 test_steps: 39


##### compute class weights

In [19]:
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(train_gen.classes)
class_weights = compute_class_weight('balanced', classes=classes, y=train_gen.classes)
class_weights = dict(enumerate(class_weights))
print("class_weights:", class_weights)

class_weights: {0: np.float64(1.9449047224523612), 1: np.float64(0.6730217889908257)}


##### MODEL MobileNetV2

In [20]:
base = MobileNetV2(input_shape=IMAGE_SIZE + (3,), include_top=False, weights='imagenet')
base.trainable = False

inp = Input(shape=IMAGE_SIZE + (3,))
x = base(inp, training=False)
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.4)(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(inp, out)
model.compile(optimizer=Adam(INITIAL_LR), loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


##### Callbacks

In [21]:
es  = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True, verbose=1)
rlp = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-7, verbose=1)
ckp = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

##### Stage 1: Train top (frozen base)

In [22]:
EPOCHS_STAGE1 = 8

history1 = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_gen,
    validation_steps=validation_steps,
    epochs=EPOCHS_STAGE1,
    callbacks=[es, rlp, ckp],
    class_weight=class_weights
)

  self._warn_if_super_not_called()


Epoch 1/8
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 340ms/step - accuracy: 0.8679 - auc: 0.9403 - loss: 0.2913
Epoch 1: val_loss improved from inf to 0.19496, saving model to best_model.h5




[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 425ms/step - accuracy: 0.8680 - auc: 0.9404 - loss: 0.2910 - val_accuracy: 0.9117 - val_auc: 0.9848 - val_loss: 0.1950 - learning_rate: 0.0010
Epoch 2/8
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 0.9400 - auc: 0.9867 - loss: 0.1463
Epoch 2: val_loss improved from 0.19496 to 0.16199, saving model to best_model.h5




[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 329ms/step - accuracy: 0.9400 - auc: 0.9867 - loss: 0.1463 - val_accuracy: 0.9386 - val_auc: 0.9884 - val_loss: 0.1620 - learning_rate: 0.0010
Epoch 3/8
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 0.9504 - auc: 0.9888 - loss: 0.1277
Epoch 3: val_loss did not improve from 0.16199
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 328ms/step - accuracy: 0.9504 - auc: 0.9888 - loss: 0.1277 - val_accuracy: 0.9309 - val_auc: 0.9902 - val_loss: 0.1816 - learning_rate: 0.0010
Epoch 4/8
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step - accuracy: 0.9461 - auc: 0.9907 - loss: 0.1198
Epoch 4: val_loss improved from 0.16199 to 0.12353, saving model to best_model.h5




[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 327ms/step - accuracy: 0.9461 - auc: 0.9907 - loss: 0.1198 - val_accuracy: 0.9443 - val_auc: 0.9901 - val_loss: 0.1235 - learning_rate: 0.0010
Epoch 5/8
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 0.9473 - auc: 0.9900 - loss: 0.1238
Epoch 5: val_loss did not improve from 0.12353
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 325ms/step - accuracy: 0.9473 - auc: 0.9900 - loss: 0.1237 - val_accuracy: 0.9539 - val_auc: 0.9926 - val_loss: 0.1238 - learning_rate: 0.0010
Epoch 6/8
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step - accuracy: 0.9597 - auc: 0.9917 - loss: 0.1076
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.

Epoch 6: val_loss did not improve from 0.12353
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

##### Stage 2: Fine-tune (unfreeze some layers)

In [23]:
# unfreeze last 20 layers
for layer in base.layers[:-20]:
    layer.trainable = False
for layer in base.layers[-20:]:
    layer.trainable = True

model.compile(optimizer=Adam(FINE_TUNE_LR), loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

EPOCHS_STAGE2 = 6
history2 = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_gen,
    validation_steps=validation_steps,
    epochs=EPOCHS_STAGE2,
    callbacks=[es, rlp, ckp],
    class_weight=class_weights
)

Epoch 1/6
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322ms/step - accuracy: 0.9173 - auc: 0.9676 - loss: 0.2466
Epoch 1: val_loss did not improve from 0.12353
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 385ms/step - accuracy: 0.9173 - auc: 0.9677 - loss: 0.2465 - val_accuracy: 0.9482 - val_auc: 0.9774 - val_loss: 0.1462 - learning_rate: 1.0000e-05
Epoch 2/6
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step - accuracy: 0.9377 - auc: 0.9821 - loss: 0.1638
Epoch 2: val_loss did not improve from 0.12353
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 335ms/step - accuracy: 0.9377 - auc: 0.9821 - loss: 0.1638 - val_accuracy: 0.9386 - val_auc: 0.9871 - val_loss: 0.1449 - learning_rate: 1.0000e-05
Epoch 3/6
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294ms/step - accuracy: 0.9383 - auc: 0.9856 - loss: 0.1553
Epoch 3: val_loss improved from 0.12353 to 0.09022, saving model to best



[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 328ms/step - accuracy: 0.9383 - auc: 0.9856 - loss: 0.1553 - val_accuracy: 0.9597 - val_auc: 0.9941 - val_loss: 0.0902 - learning_rate: 1.0000e-05
Epoch 4/6
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step - accuracy: 0.9467 - auc: 0.9879 - loss: 0.1314
Epoch 4: val_loss did not improve from 0.09022
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 326ms/step - accuracy: 0.9467 - auc: 0.9879 - loss: 0.1314 - val_accuracy: 0.9559 - val_auc: 0.9854 - val_loss: 0.1453 - learning_rate: 1.0000e-05
Epoch 5/6
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 0.9441 - auc: 0.9851 - loss: 0.1480
Epoch 5: ReduceLROnPlateau reducing learning rate to 1.9999999494757505e-06.

Epoch 5: val_loss did not improve from 0.09022
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━

In [24]:
from tensorflow.keras.models import load_model
model = load_model("best_model.h5")



In [25]:
test_loss, test_acc, test_auc = model.evaluate(test_gen)
print("Test Accuracy:", test_acc)
print("Test AUC:", test_auc)

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 105ms/step - accuracy: 0.7606 - auc: 0.6059 - loss: 0.8401
Test Accuracy: 0.8557692170143127
Test AUC: 0.9351304769515991
