# Medical Image Classifier

In [1]:
!pip install seaborn



In [2]:
import os
import glob
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import boto3

import sagemaker
from sagemaker import image_uris
from sagemaker import get_execution_role
from sagemaker.tuner import CategoricalParameter, ContinuousParameter,  HyperparameterTuner
from sagemaker.session import TrainingInput

sess = sagemaker.Session()



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## 01. Import and save Kaggle dataset

- Do not repeat. Dataset has already been imported.

In [None]:
!pip install -q kaggle

In [None]:
!mkdir ~/.kaggle

In [None]:
!touch ~/.kaggle/kaggle.json

In [None]:
api_token = {"username": "dorregocabrera", "key": "5eb3899dd2c8424c42108b634e734241"}

In [None]:
import json

with open(".kaggle/kaggle.json", "w") as file:
    json.dump(api_token, file)

In [None]:
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia --force

In [None]:
import zipfile
with zipfile.ZipFile("./chest-xray-pneumonia.zip", "r") as zip_ref:
    zip_ref.extractall("./data")

## 02. Visualize randomly-selected x-ray

In [None]:
import glob
import random
import matplotlib.pyplot as plt

def get_random_image(dir,  condition):
    placeholder = ""

    if condition == "n":
        placeholder = "NORMAL"
    elif condition == "p":
        placeholder = "PNEUMONIA"
    else:
        raise Exception("Sorry, invalid condition")

    folder = f"./data/chest_xray/{dir}/{placeholder}/*.jpeg"
    img_paths = glob.glob(folder)
    max_length = len(img_paths)
    random_number = random.randint(0, max_length)

    for index, item in enumerate(img_paths, start=1):
        if index == random_number:
            print(index, item)
            image = plt.imread(item)
            ready_image = plt.imshow(image)
            return ready_image

In [None]:
get_random_image("train", "p")

In [None]:
from PIL import Image

image = Image.open("data/chest_xray/val/NORMAL/NORMAL2-IM-1427-0001.jpeg")
print(image.format)
print(image.size)
print(image.mode)

## 03. Resize images and ensure images are greyscale

In [None]:
import glob
import matplotlib.pyplot as plt
from PIL import Image

# training dataset

folder = "./data/chest_xray/train/*/*.jpeg"

counter_pneu = 0
counter_normal = 0

img_paths = glob.glob(folder)

for i in img_paths:
    if "person" in i:
        full_size_image = Image.open(i)
        im = full_size_image.resize((224, 224))
        plt.imsave(
            fname="./data/chest_xray/train" + "/train_pneumonia" + str(counter_pneu) + ".jpeg",
            arr=im,
            format="jpeg",
            cmap="gray"
        )
        counter_pneu += 1
    
    else:
        full_size_image = Image.open(i)
        im = full_size_image.resize((224, 224))
        plt.imsave(
            fname="./data/chest_xray/train" + "/train_normal" + str(counter_normal) + ".jpeg",
            arr=im,
            format="jpeg",
            cmap="gray"
        )
        counter_normal += 1

In [None]:
# test dataset
folder = "./data/chest_xray/test/*/*.jpeg"

counter_pneu = 0
counter_normal = 0

img_paths = glob.glob(folder)

for i in img_paths:
    if "person" in i:
        full_size_image = Image.open(i)
        im = full_size_image.resize((224, 224))
        plt.imsave(
            fname="./data/chest_xray/test" + "/test_pneumonia" + str(counter_pneu) + ".jpeg",
            arr=im,
            format="jpeg",
            cmap="gray"
        )
        counter_pneu += 1
    
    else:
        full_size_image = Image.open(i)
        im = full_size_image.resize((224, 224))
        plt.imsave(
            fname="./data/chest_xray/test" + "/test_normal" + str(counter_normal) + ".jpeg",
            arr=im,
            format="jpeg",
            cmap="gray"
        )
        counter_normal += 1

In [None]:
# validation dataset
folder = "./data/chest_xray/val/*/*.jpeg"

counter_pneu = 0
counter_normal = 0

img_paths = glob.glob(folder)

for i in img_paths:
    if "person" in i:
        full_size_image = Image.open(i)
        im = full_size_image.resize((224, 224))
        plt.imsave(
            fname="./data/chest_xray/val" + "/val_pneumonia" + str(counter_pneu) + ".jpeg",
            arr=im,
            format="jpeg",
            cmap="gray"
        )
        counter_pneu += 1
    
    else:
        full_size_image = Image.open(i)
        im = full_size_image.resize((224, 224))
        plt.imsave(
            fname="./data/chest_xray/val" + "/val_normal" + str(counter_normal) + ".jpeg",
            arr=im,
            format="jpeg",
            cmap="gray"
        )
        counter_normal += 1

## 04. Create visualizations

In [None]:
# prepare data to create dataframe 

import glob
import pandas

folder = "./data/chest_xray/*/*.jpeg"

category = []
condition_of_lung = []
filenames = []

all_files = glob.glob(folder)

for filename in all_files:
    filenames.append(filename)
    
    if "train" in filename:
        category.append("train")

        if "pneumonia" in filename:
            condition_of_lung.append("pneumonia")
        
        elif "normal" in filename:
            condition_of_lung.append("normal")

        else:
            condition_of_lung.append(np.nan)
    
    elif "val" in filename:
        category.append("val")
        
        if "pneumonia" in filename:
            condition_of_lung.append("pneumonia")
        
        elif "normal" in filename:
            condition_of_lung.append("normal")
        
        else:
            condition_of_lung.append(np.nan)

    elif "test" in filename:
        category.append("test")
        
        if "pneumonia" in filename:
            condition_of_lung.append("pneumonia")
        
        elif "normal" in filename:
            condition_of_lung.append("normal")

        else:
            condition_of_lung.append(np.nan)

    else:
        category.append(np.nan)
        
        if "pneumonia" in filename:
            condition_of_lung.append("pneumonia")
        
        elif "normal" in filename:
            condition_of_lung.append("normal")

        else:
            condition_of_lung.append(np.nan)

In [None]:
# create dataframe

all_data_df = pd.DataFrame({
    "dataset_type": category,
    "x_ray_result": condition_of_lung,
    "filename": filenames,
})

all_data_df.head()
    

In [None]:
all_data_df.shape

In [None]:
# create bar graphs
g = sns.catplot(x="x_ray_result", col="dataset_type", kind="count", palette="ch:.55", data=all_data_df, legend=True)

for i in range(0, 3):
    ax = g.facet_axis(0, i)
    for p in ax.patches:
        ax.text(
            p.get_x() + 0.3,
            p.get_height() * 1.05,
            "{0:.0f}".format(p.get_height()),
            color="black",
            rotation="horizontal",
            size="large",
        )

In [None]:
# create train dataframe to transform to LST file
train_folder = "./data/chest_xray/train/*.jpeg"
train_df_lst = pd.DataFrame(columns=["labels", "s3_path"], dtype=object)
train_imgs_path = glob.glob(train_folder)
counter = 0
class_arg = ""

for path in train_imgs_path:
    if "pneumonia" in path:
        class_arg = 1
    else:
        class_arg = 0

    train_df_lst.loc[counter] = [class_arg, os.path.basename(path)]
    counter += 1

print(train_df_lst.head())
                            

In [None]:
# create test dataframe to transform to LST file
test_folder = "./data/chest_xray/test/*.jpeg"
test_df_lst = pd.DataFrame(columns=["labels", "s3_path"], dtype=object)
test_imgs_path = glob.glob(test_folder)
counter = 0
class_arg = ""

for path in test_imgs_path:
    if "pneumonia" in path:
        class_arg = 1
    else:
        class_arg = 0

    test_df_lst.loc[counter] = [class_arg, os.path.basename(path)]
    counter += 1

print(test_df_lst.head())

In [None]:
# create LST files
def save_to_lst(df, prefix):
    return df[["labels", "s3_path"]].to_csv(
        f"{prefix}.lst", sep="\t", index=True, header=False
    )

# train dataset
save_to_lst(train_df_lst.copy(), "train")

# test dataset
save_to_lst(test_df_lst.copy(), "test")

In [3]:
# save LST file to S3 bucket
bucket = "medical-ai-chest-xray-333"
print("bucket: {}".format(bucket))

region = "us-east-1"
print("region: {}".format(region))

arn_role = "arn:aws:s3:::medical-ai-chest-xray-333"
print("ARN role: {}".format(arn_role))


bucket: medical-ai-chest-xray-333
region: us-east-1
ARN role: arn:aws:s3:::medical-ai-chest-xray-333


In [4]:
os.environ["DEFAULT_S3_BUCKET"] = bucket

In [None]:
# !aws s3 sync ./data/chest_xray/train s3://${DEFAULT_S3_BUCKET}/train/

In [None]:
# !aws s3 sync ./data/chest_xray/test s3://${DEFAULT_S3_BUCKET}/test/

In [None]:
# boto3.Session().resource("s3").Bucket(bucket).Object("train.lst").upload_file("./train.lst")

In [None]:
# boto3.Session().resource("s3").Bucket(bucket).Object("test.lst").upload_file("./test.lst")

## 05. Training estimator

In [5]:
# get Docker image for image classifier from Elastic container registry
algorithm_image = image_uris.retrieve(
    region=boto3.Session().region_name,
    framework="image-classification",    
)

s3_output_location = f"s3://{bucket}/models/image_model"
print(algorithm_image)

811284229777.dkr.ecr.us-east-1.amazonaws.com/image-classification:1


In [6]:
role = get_execution_role()
print(role)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
arn:aws:iam::588336835903:role/service-role/AmazonSageMaker-ExecutionRole-20231213T093692


In [7]:
# increasing # of instances will increase training speed
# it will come at the cost of accuracy
# input_mode as 'pipe' allows data to be streamed in from S3. No need to save images to local drive
img_classifier_model = sagemaker.estimator.Estimator(
    algorithm_image,
    role=role,
    instant_count=1,
    instance_type="ml.p2.xlarge",
    volume_size=50,
    max_run=432000,
    input_mode="File", # copy all training images to the local directory
    output_path=s3_output_location,
    sagemaker_session=sess,
)
print(img_classifier_model)

<sagemaker.estimator.Estimator object at 0x7f62e8786110>


In [8]:
# count number of training images
count = 0

for filepath in glob.glob("./data/chest_xray/train/*.jpeg"):
    count += 1
print(count)

0


In [9]:
# set up hyperparameters
img_classifier_model.set_hyperparameters(
    image_shape="3,224,224",
    num_classes=2,
    user_pretrained_model=1, # turns on transfer learning
    num_training_samples=5216,
    augmentation_type="crop_color_transformation",
    epochs=15,
    early_stopping=True, # avoids overfitting and expensive training
    early_stopping_min_epochs=8, # minimum number allowed
    early_stopping_patience=5,
    early_stopping_tolerance=0.0,
    lr_scheduler_factor=0.1,
    lr_scheduler_step="8,10,12",
)

In [10]:
# tune hyperparameters
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(0.01, 0.1),
    # minim_batch_size is number of training examples in 1 forward and backward pass
    "mini_batch_size": CategoricalParameter([8, 16, 32]),
    "optimizer": CategoricalParameter(["sgd", "adam"]),
}

In [11]:
objective_metric_name = "validation:accuracy"
objective_type = "Maximize"
max_jobs = 5
max_parallel_jobs = 1

In [12]:
tuner = HyperparameterTuner(
    estimator=img_classifier_model,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type=objective_type,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
)

In [13]:
model_inputs = {
    "train": sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{bucket}/train/",
        content_type="application/x-large"
    ),
    # SageMarket calls refers to 'test' as 'validation'
     "validation": sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{bucket}/validation/",
        content_type="application/x-large"
    ),
    "train_lst": sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{bucket}/train.lst",
        content_type="application/x-large"
    ),
    "validation_lst": sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{bucket}/test.lst",
        content_type="application/x-large"
    ),
}


In [14]:
job_name_prefix = "classifier"
timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = job_name_prefix = timestamp

In [15]:
tuner.fit(inputs=model_inputs, job_name=job_name, logs=True)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


ValueError: instance_count and instance_type must be set if instance_groups is not set