# Image Classification Training, Deployment & Prediction Using AWS SageMaker and TensorFlow

## An experiment on medical images

1. [Import Requirements and Set Up](#Import-Requirements-and-Set-Up)
1. [Prepare the Dataset](#Prepare-the-Dataset)
1. [Prepare Training Script](#Prepare-Training-Script)
1. [Train with TensorFlow Estimator](#Train-with-TensorFlow-Estimator)
1. [Deploy TensorFlow Model](#Deploy-TensorFlow-Model)
1. [Test and Prediction Task](#Test-and-Prediction-Task)
1. [Delete the Endpoint](#Delete-the-Endpoint)

## Import Requirements and Set Up

### Import the Required Modules
First, we'll need to load all the required modules. 

In [39]:
import sys
import boto3
import sagemaker 
from sagemaker.tensorflow import TensorFlow
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.s3 import S3Uploader
from sagemaker.s3 import S3Downloader
import random
import shutil

# SageMaker session shared by the S3 download and upload calls in this notebook.
session = sagemaker.Session() 
# Region name configured for the default boto3 session.
region = boto3.Session().region_name
# Default bucket of the session; the S3 paths used in this notebook are built on it.
bucket = session.default_bucket() 


## Prepare the Dataset
This notebook uses a pretrained mobilenet model to save training time.

We use AWS S3 to store and manage our image data. In this notebook, we only use 1000 normal images and 1000 pneumonia images from the ChestXray2017 dataset.

In [7]:
# Top-level S3 key prefix of this project inside the bucket.
project_name = "pn_deploy"

# Sub-prefixes of the two dataset splits.
train_prefix = "train"
val_prefix = "validation"

# S3 URIs of the split folders: s3://<bucket>/<project_name>/<split>/
train_data = "s3://{}/{}/{}/".format(bucket, project_name, train_prefix)
validation_data = "s3://{}/{}/{}/".format(bucket, project_name, val_prefix)


### Download or Update Data from S3

In [8]:
def get_file_list(bucket_name, prefix):
    """List the S3 URIs of all objects in a bucket whose key starts with a prefix.

    The object whose key is exactly the prefix followed by a single ``/`` (the
    placeholder of the root folder itself) is left out, whether or not
    ``prefix`` is given with a trailing slash.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix to filter on, e.g. ``"pn_deploy/normal_1000"``.

    Returns:
        A list of ``s3://<bucket>/<key>`` strings, in listing order.
    """
    s3 = boto3.resource('s3')
    matching_objects = s3.Bucket(bucket_name).objects.filter(Prefix=prefix)

    # Normalise the prefix so "a/b" and "a/b/" both identify the placeholder key "a/b/".
    root_folder_key = prefix.rstrip('/') + '/'

    return [
        "s3://{}/{}".format(obj.bucket_name, obj.key)
        for obj in matching_objects
        if obj.key != root_folder_key
    ]


In [9]:
list_normal = get_file_list(bucket,"pn_deploy/normal_1000")
list_pneumonia = get_file_list(bucket,"pn_deploy/pneumonia_1000")


In [15]:
# Download data: each S3 listing is paired with the local folder that receives its images.
download_plan = (
    (list_normal, "/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/data/normal/"),
    (list_pneumonia, "/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/data/pneumonia/"),
)

for uri_list, local_dir in download_plan:
    for l in uri_list:
        # One download call per listed object.
        data_source1 = S3Downloader.download(
            local_path=local_dir,
            s3_uri=l,
            sagemaker_session=session,
        )


### Generate annotations

In [20]:
# Generate annotations: one "<image name without extension> <class>" line per image,
# written to annotations/<class>.txt.
import os

base_dir = '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia'
annotations_dir = os.path.join(base_dir, 'annotations')
# exist_ok lets the cell be re-run without failing on the existing folder.
os.makedirs(annotations_dir, exist_ok=True)

for class_label in ('normal', 'pneumonia'):
    filePath = os.path.join(base_dir, 'data', class_label)
    ant = {}
    for n in os.listdir(filePath):
        # Skip sub-directories; only regular files are images to annotate.
        if not os.path.isfile(os.path.join(filePath, n)):
            continue
        # splitext strips only the final extension, so names with several dots
        # (or none) no longer break the unpacking.
        name, _ = os.path.splitext(n)
        ant[name] = class_label
    with open(os.path.join(annotations_dir, class_label + '.txt'), 'w') as f:
        for n, c in ant.items():
            f.write(str(n) + " " + str(c) + "\n")
        

In [21]:
# read annotations
def get_annotations(file_path, annotations=None):
    """Read an annotation file into an ``{image file name: class name}`` dictionary.

    Every non-blank line is expected to read ``<image name> <class name>``; the
    image name is stored with ``.jpeg`` appended.

    Args:
        file_path: Path of the annotation text file.
        annotations: Optional dictionary to update in place. When omitted, a new
            dictionary is created on every call so that separate calls never
            share entries.

    Returns:
        The dictionary holding the parsed entries (the one passed in, if any).

    Raises:
        ValueError: If a non-blank line contains no space between name and class.
    """
    if annotations is None:
        annotations = {}

    with open(file_path, 'r') as f:
        rows = f.read().splitlines()

    for row in rows:
        if not row.strip():
            # Tolerate blank lines, e.g. a trailing empty line.
            continue
        # Split on the last space so image names containing spaces stay intact.
        image_name, class_name = row.rsplit(' ', 1)
        annotations[image_name + '.jpeg'] = class_name

    return annotations


In [23]:
# Load the two annotation files into separate dictionaries, each starting empty.
annotations_normal = get_annotations(
    '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/annotations/normal.txt',
    {},
)
annotations_pneumonia = get_annotations(
    '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/annotations/pneumonia.txt',
    {},
)

# Report how many examples each class contains.
total_count = len(annotations_normal)
print('Total normal examples', total_count)
total_count = len(annotations_pneumonia)
print('Total pneumonia examples', total_count)


Total normal examples 1000
Total pneumonia examples 1000


In [24]:
# Show the first (image file name, class name) pair of each dictionary as a quick check.
print(next(iter(annotations_normal.items())))
print(next(iter(annotations_pneumonia.items())))


('NORMAL2-IM-0774-0001.jpeg', 'normal')
('person109_virus_203.jpeg', 'pneumonia')


### Split Data and Upload

In [25]:
# Create the local folder tree custom_data/<set>/<class> that receives the split images.
import os

classes = ['normal', 'pneumonia']
sets = ['train', 'validation']
root_dir = '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/custom_data'

# Collect the folders parent-first so every os.mkdir call finds its parent in place.
required_dirs = [root_dir]
for set_name in sets:
    set_dir = os.path.join(root_dir, set_name)
    required_dirs.append(set_dir)
    required_dirs.extend([os.path.join(set_dir, label) for label in classes])

for folder in required_dirs:
    # Only create what is missing, so the cell can be re-run.
    if not os.path.isdir(folder):
        os.mkdir(folder)
            

In [26]:
# Copy every image into custom_data/<set>/<class>, sending roughly 20% to validation.
split_sources = (
    (annotations_normal, '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/data/normal'),
    (annotations_pneumonia, '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/data/pneumonia'),
)

for class_annotations, source_dir in split_sources:
    for image, class_name in class_annotations.items():
        # Seed a private RNG with the file name so an image always lands in the same set.
        # With an unseeded draw, re-running this cell could copy the same image into the
        # other set as well and leak validation data into training.
        draw = random.Random(image).randint(0, 99)
        target_set = 'validation' if draw < 20 else 'train'
        target_path = os.path.join(root_dir, target_set, class_name, image)
        shutil.copy(os.path.join(source_dir, image), target_path)
    

In [27]:
# Count the images copied into every <set>/<class> folder and total them per set.
sets_counts = {}

for set_name in sets:
    per_class_counts = []
    for class_name in classes:
        path = os.path.join(root_dir, set_name, class_name)
        count = len(os.listdir(path))
        print(path, 'has', count, 'images')
        per_class_counts.append(count)
    sets_counts[set_name] = sum(per_class_counts)

print(sets_counts)


/home/ec2-user/SageMaker/img_cls_exp/Medical Image/Pneumonia/custom_data/train/normal has 814 images
/home/ec2-user/SageMaker/img_cls_exp/Medical Image/Pneumonia/custom_data/train/pneumonia has 790 images
/home/ec2-user/SageMaker/img_cls_exp/Medical Image/Pneumonia/custom_data/validation/normal has 186 images
/home/ec2-user/SageMaker/img_cls_exp/Medical Image/Pneumonia/custom_data/validation/pneumonia has 210 images
{'train': 1604, 'validation': 396}


In [29]:
print('Uploading to S3..')
# Upload the local split folder tree under the 'pn_deploy' key prefix; the returned
# S3 URI is what estimator.fit() receives later in the notebook.
# NOTE(review): the same prefix also holds the raw images, the test/task data and the
# estimator's output_path, so the training input presumably covers more than the
# train/ and validation/ folders — consider a dedicated prefix; confirm before changing.
s3_data_path = session.upload_data(path=root_dir, bucket=bucket, key_prefix='pn_deploy')

print('Uploaded to', s3_data_path)


Uploading to S3..
Uploaded to s3://sagemaker-us-east-2-179199196742/pn_deploy


## Prepare Training Script

### Create Model

In [30]:
%%writefile train.py

import tensorflow as tf
import argparse
import os
import json

def create_model():
    """Build and compile the binary image classifier.

    The network is a MobileNetV2 backbone (ImageNet weights, no top, average
    pooling, 224x224x3 input) followed by dropout (0.2) and one sigmoid unit.
    The backbone is left trainable.

    Returns:
        The Keras model, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    backbone = tf.keras.applications.mobilenet_v2.MobileNetV2(
        include_top=False,
        weights='imagenet',
        pooling='avg',
        input_shape=(224, 224, 3),
    )
    # The pretrained backbone is trained together with the new head.
    backbone.trainable = True

    classifier = tf.keras.models.Sequential([
        backbone,
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier


Writing train.py


### Data Generators

In [31]:
%%writefile -a train.py

def create_data_generators(root_dir, batch_size):
    """Create the training and validation image iterators.

    Images are read from ``<root_dir>/train`` and ``<root_dir>/validation``,
    resized to 224x224 and passed through the MobileNetV2 preprocessing
    function. Only the training images are augmented (horizontal flip, zoom
    between 0.8 and 1.2, rotation up to 20).

    Args:
        root_dir: Directory containing the ``train`` and ``validation`` folders.
        batch_size: Number of images per batch for both iterators.

    Returns:
        A ``(train_iterator, validation_iterator)`` tuple, both in binary class mode.
    """
    preprocess = tf.keras.applications.mobilenet_v2.preprocess_input
    # Options shared by both iterators.
    flow_options = dict(
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode='binary',
    )

    augmenting_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=preprocess,
        horizontal_flip=True,
        zoom_range=[0.8, 1.2],
        rotation_range=20,
    )
    plain_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=preprocess,
    )

    train_iterator = augmenting_generator.flow_from_directory(
        os.path.join(root_dir, 'train'), **flow_options
    )
    val_iterator = plain_generator.flow_from_directory(
        os.path.join(root_dir, 'validation'), **flow_options
    )
    return train_iterator, val_iterator


Appending to train.py


### Putting it Together

In [32]:
%%writefile -a train.py

if __name__ == '__main__':
    # Script entry point: parse the hyperparameters, train the model and save it
    # below the model output directory.

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--batch_size', type=int, default=16)
    # Step counts default to None and are then derived from the number of images the
    # generators actually find, instead of hard-coded totals that go stale whenever
    # the split or the batch size changes.
    parser.add_argument('--steps', type=int, default=None)
    parser.add_argument('--val_steps', type=int, default=None)

    # input data and model directories
    parser.add_argument('--model-dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))

    # parse_known_args ignores any extra arguments this script does not declare.
    args, _ = parser.parse_known_args()

    local_output_dir = args.sm_model_dir
    local_root_dir = args.train
    batch_size = args.batch_size

    model = create_model()
    train_gen, val_gen = create_data_generators(local_root_dir, batch_size)

    # One pass over each split per epoch unless the caller overrides the step counts.
    steps = args.steps if args.steps is not None else max(1, train_gen.samples // batch_size)
    val_steps = args.val_steps if args.val_steps is not None else max(1, val_gen.samples // batch_size)

    _ = model.fit(
        train_gen,
        epochs=args.epochs,
        steps_per_epoch=steps,
        validation_data=val_gen,
        validation_steps=val_steps
    )

    # Saved under <model dir>/model/1.
    model.save(os.path.join(local_output_dir, 'model', '1'))
    

Appending to train.py


## Train with TensorFlow Estimator
Check [SageMaker endpoints and quotas](https://docs.aws.amazon.com/general/latest/gr/sagemaker.html#limits_sagemaker) for the quotas that apply to SageMaker training and deployment.

In [33]:
# IAM role that the training job runs with.
role = sagemaker.get_execution_role()
# TensorFlow estimator that runs train.py; the model artifacts are written below
# s3://<bucket>/<project_name>.
# NOTE(review): instance_count=2 is set without any distribution configuration and
# train.py contains no distributed-training code, so the second instance presumably
# repeats the same work — confirm whether one instance is enough.
estimator = TensorFlow(
    entry_point='train.py',
    role=role,
    instance_count=2,
    instance_type='ml.m5.4xlarge', # you can use any instance_type within your quotas
    framework_version='2.1.0',
    py_version='py3',
    output_path="s3://{}/{}".format(bucket, project_name),
)


In [87]:
# If the upload cell was not run in this session, set the data location by hand first, e.g.:
# s3_data_path = 's3://sagemaker-us-east-2-179199196742/pn_deploy'
# Start the training job on the uploaded data (presumably exposed to train.py through
# SM_CHANNEL_TRAINING, which the script reads as its default --train value).
estimator.fit(s3_data_path)


## Deploy TensorFlow Model

In [35]:
# Deploy the trained model to an endpoint backed by one ml.m4.xlarge instance;
# `predictor` is the handle the prediction cells send requests through.
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
print('\nModel Deployed!')

-------------!
Model Deployed!


In [89]:
## Test and Prediction Task

## Test Predictions

In [59]:
import os

# Local folder that receives the downloaded test image.
test_dir = '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/test/'

list_normal_test = get_file_list(bucket, "pn_deploy/test/normal")
list_pneumonia_test = get_file_list(bucket, "pn_deploy/test/pneumonia")
# makedirs also creates missing parents and does not fail when the folder already exists.
os.makedirs(test_dir, exist_ok=True)

# Use the first listed normal image as the smoke-test sample.
test = list_normal_test[0]
data_source = S3Downloader.download(local_path=test_dir, s3_uri=test)
# Take the file name from the URI itself; a fixed character offset only works for
# one particular bucket name length.
image_path = test_dir + test.rsplit('/', 1)[-1]
image_path

'/home/ec2-user/SageMaker/img_cls_exp/Medical_Image/Pneumonia/test/NORMAL2-IM-1436-0001.jpeg'

In [60]:
import tensorflow as tf
import numpy as np
def get_pred(image_path):
    """Classify one local image through the deployed endpoint.

    The image is loaded at 224x224, converted to an array, run through the
    MobileNetV2 preprocessing function and sent as a batch of one to the
    global ``predictor``.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        The entry of the global ``classes`` list selected by the prediction:
        index 1 when the returned score is above 0.5, index 0 otherwise.
    """
    keras_image = tf.keras.preprocessing.image
    pixels = keras_image.img_to_array(
        keras_image.load_img(image_path, target_size=(224, 224))
    )
    pixels = tf.keras.applications.mobilenet_v2.preprocess_input(pixels)
    # The endpoint expects a batch, so add a leading batch axis.
    batch = np.expand_dims(pixels, axis=0)

    response = predictor.predict(batch)
    score = np.squeeze(response['predictions'])
    class_id = int(score > 0.5)
    return classes[class_id]

In [61]:
# Classify the single test image whose local path is held in image_path.
get_pred(image_path)

'normal'

## Prediction Task
We create a folder in the S3 bucket to store the data to be predicted. Running the following cells makes the predictions and sends the output back to the same folder in the S3 bucket and to GitHub.

In [80]:
# List the S3 URIs of the images to predict, stored under pn_deploy/task/data.
list_task = get_file_list(bucket, "pn_deploy/task/data")

In [81]:
# Download every listed task image into the local task/data folder.
for l in list_task:
    data_source = S3Downloader.download(
    local_path='/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/task/data/',
    s3_uri=l,
    )

In [88]:
# Local paths of the downloaded task images. The file name is taken from the end of
# each S3 URI; a fixed character offset only works for one particular bucket name length.
image_path = [
    '/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/task/data/' + l.rsplit('/', 1)[-1]
    for l in list_task
]
image_path

In [84]:
# Write one "<image file name> <predicted class>" line per task image.
with open('/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/task/prediction_output.txt', 'w') as f:
    for i in image_path:
        # Take the file name from the path itself; a fixed character offset breaks
        # as soon as the local folder path has a different length.
        f.write(i.rsplit('/', 1)[-1] + " " + get_pred(i) + "\n")

In [85]:
print('Uploading to S3..')
# Use a dedicated variable: storing this URI in s3_data_path would overwrite the
# training-data location that estimator.fit() reads if the training cell is re-run.
s3_prediction_path = session.upload_data(path='/home/ec2-user/SageMaker/img_cls_exp/MedicalImage/Pneumonia/task/prediction_output.txt', bucket=bucket, key_prefix='pn_deploy/task/pred_output')
print('Uploaded to', s3_prediction_path)

Uploading to S3..
Uploaded to s3://sagemaker-us-east-2-179199196742/pn_deploy/task/pred_output/prediction_output.txt


## Delete the Endpoint
When you are done, make sure to clean up your AWS account by deleting resources you won't be reusing.

In [None]:
# Delete the endpoint behind `predictor` once it is no longer needed.
predictor.delete_endpoint()

---