# Custom Training with TensorFlow in Sagemaker

# Download Data

In [29]:
%matplotlib inline  # For Jupyter notebooks, ensures plots display directly inline

# Standard library imports
import os  # For interacting with the file system
import tarfile  # For extracting .tar.gz files
import urllib  # For downloading files from the web
import shutil  # High-level file operations (copying, moving, deleting)
import json  # For working with JSON data format
import random  # For generating random numbers (useful for shuffling data)

# Numerical libraries
import numpy as np  # The workhorse for numerical operations in Python
import tensorflow as tf  # The deep learning framework we'll be using
import sagemaker  # Amazon SageMaker SDK for machine learning tasks

# Image processing library
from PIL import Image  # Python Imaging Library (PIL) for image manipulation
from matplotlib import pyplot as plt  # For plotting images and visualizations

# URLs for the Oxford-IIIT Pet Dataset
urls = ['http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz', 'http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz']

print('Libraries imported')  # A simple confirmation message

UsageError: unrecognized arguments: # For Jupyter notebooks, ensures plots display directly inline


In [3]:
def download_and_extract(data_dir, download_dir):
    """Downloads and extracts the dataset from the provided URLs.
    
    Args:
        data_dir: The directory where the extracted data should be stored.
        download_dir: The directory where the downloaded files will be temporarily saved.
    """
    for url in urls:
        target_file = url.split('/')[-1]  # Get the filename from the URL (e.g., 'images.tar.gz')
        
        if target_file not in os.listdir(download_dir):  # Check if already downloaded
            print('Downloading', url)
            urllib.request.urlretrieve(url, os.path.join(download_dir, target_file))  # Download file
            tf = tarfile.open(os.path.join(download_dir, target_file))  # Open tar file
            tf.extractall(data_dir)  # Extract to data_dir
        else:
            print('Already downloaded', url)

def get_annotations(file_path, annotations={}):
    """Reads annotations file and extracts image names and labels.

    Args:
        file_path: Path to the annotations file.
        annotations: An optional dictionary to store annotations (image_name: label).

    Returns:
        The updated annotations dictionary.
    """
    with open(file_path, 'r') as f:
        rows = f.read().splitlines()  # Read file and split into lines

    for i, row in enumerate(rows):
        image_name, _, _, _ = row.split(' ')  # Split line by space and take first 4 values
        class_name = image_name.split('_')[:-1]  # Get class name by splitting on '_' and taking all but last
        class_name = '_'.join(class_name)  # Join class name parts back together
        image_name = image_name + '.jpg'  # Add .jpg extension to image name

        # Determine label based on class name
        annotations[image_name] = 'cat' if class_name[0] != class_name[0].lower() else 'dog'

    return annotations

In [4]:
if not os.path.isdir('data'):
    os.mkdir('data')  # Create a directory named 'data' if it doesn't exist

download_and_extract('data', '.')  # Download and extract data to the 'data' directory

Downloading http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
Downloading http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz


# Dataset for Training

In [55]:
annotations = get_annotations('data/annotations/trainval.txt')  # Get annotations from trainval set
annotations = get_annotations('data/annotations/test.txt', annotations)  # Add annotations from test set

total_count = len(annotations.keys())  # Calculate the total number of annotated images
print('Total examples', total_count)  # Print the total count


Total examples 7349


In [56]:
next(iter(annotations.items()))

('Abyssinian_100.jpg', 'cat')

In [57]:
classes = ['cat', 'dog']  # Define the classes for your image classification task
sets = ['train', 'validation']  # Define the different sets of data (train and validation)
root_dir = 'custom_data'  # Define the root directory for your custom dataset

if not os.path.isdir(root_dir):
    os.mkdir(root_dir)  # Create the root directory if it doesn't exist

for set_name in sets:  # Loop through the sets (train and validation)
    if not os.path.isdir(os.path.join(root_dir, set_name)):
        os.mkdir(os.path.join(root_dir, set_name))  # Create a directory for the current set

    for class_name in classes:  # Loop through the classes (cat and dog)
        folder = os.path.join(root_dir, set_name, class_name)  
        if not os.path.isdir(folder):
            os.mkdir(folder)  # Create a directory for the current class within the current set

Copy the files to correct set/ class folders

In [58]:
for image, class_name in annotations.items():
    # Iterate through each image filename (key) and its corresponding class label (value) in the `annotations` dictionary.
    
    target_set = 'validation' if random.randint(0, 99) < 20 else 'train'
    # Assign the image to either 'validation' or 'train' set:
    #   - `random.randint(0, 99)` generates a random integer between 0 (inclusive) and 99 (inclusive).
    #   - If this random number is less than 20 (a 20% probability), the `target_set` is set to 'validation'.
    #   - Otherwise (an 80% probability), `target_set` is set to 'train'.
    #   - This effectively splits the data into roughly 80% for training and 20% for validation.

    target_path = os.path.join(root_dir, target_set, class_name, image)
    # Construct the full destination path for the image:
    #   - `root_dir`: The base directory where you're storing the custom dataset (e.g., 'custom_data').
    #   - `target_set`: The set this image belongs to ('train' or 'validation').
    #   - `class_name`: The class label of the image ('cat' or 'dog').
    #   - `image`: The original filename of the image.
    #   - The final path might look like 'custom_data/train/cat/Abyssinian_100.jpg'.

    shutil.copy(os.path.join('data/images/', image), target_path)
    # Copy the image file to its new location:
    #   - `os.path.join('data/images/', image)`: Forms the source path of the image in the original dataset.
    #   - `target_path`: The destination path calculated in the previous step.
    #   - `shutil.copy()` is a function that efficiently copies the file, preserving its contents and metadata.

In [59]:
sets_counts = {
    'train': 0,
    'validation': 0
}  # Initialize a dictionary to store image counts for each set

for set_name in sets:  # Loop through the sets (train and validation)
    for class_name in classes:  # Loop through the classes (cat and dog)
        path = os.path.join(root_dir, set_name, class_name)  # Construct the path to the current class folder
        count = len(os.listdir(path))  # Count the number of images in the folder
        print(path, 'has', count, 'images')  # Print the count for each folder
        sets_counts[set_name] += count  # Update the total count for the current set

print(sets_counts)  # Print the final counts for each set

custom_data/train/cat has 2279 images
custom_data/train/dog has 4748 images
custom_data/validation/cat has 835 images
custom_data/validation/dog has 1811 images
{'train': 7027, 'validation': 2646}


# Training Script - Create Model

In [69]:
# I used two methods to create train.py file - with open and append - inline magic


#!pip install ipython 

#%%writefile train.py  # Magic command to create a Python file named 'train.py'

#import tensorflow as tf
#import argparse  # For parsing command-line arguments
#import os
#import json

#def create_model():
   # """Creates a MobileNetV2-based image classification model."""

    #model = tf.keras.models.Sequential([
        #tf.keras.applications.mobilnet_v2.MobileNetV2(
            #include_top=False,       # Exclude the original classifier
            #pooling='avg',           # Use average pooling for feature extraction
            #weights='imagenet',       # Load pre-trained weights on ImageNet
            #input_shape=(128, 128, 3) # Resize input images to 128x128 with 3 color channels
        #),
        #tf.keras.layers.Dropout(0.5),      # Add dropout for regularization (prevents overfitting)
        #tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
    #])

    #model.layers[0].trainable = False  # Freeze the pre-trained MobileNetV2 layers
    #model.compile(
        #loss='binary_crossentropy',  # Loss function for binary classification
        #optimizer='adam',             # Optimization algorithm
        #metrics=['accuracy', 'precision', 'recall']  # Metrics to track during training
    #)

    #return model
    
# train.py
with open('train.py', 'w') as f:  # Open train.py in write mode
    f.write("""
import tensorflow as tf
import argparse
import os

def build_model():
    \"\"\"Constructs a MobileNetV2-based image classification model.\"\"\"

    base_model = tf.keras.applications.MobileNetV2(
        input_shape=(128, 128, 3),
        include_top=False,       # Exclude the original classifier
        weights='imagenet'       # Load pre-trained weights on ImageNet
    )
    base_model.trainable = False

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),  
        tf.keras.layers.Dropout(0.5),              
        tf.keras.layers.Dense(1, activation='sigmoid') 
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy']
    )

    return model
""")

# Training Script - Data Generators

In [16]:
%%writefile -a train.py

def create_data_generators(root_dir, batch_size):
    """Creates data generators for training and validation sets.

    This function takes the root directory of your custom dataset (containing 'train' and 'validation' folders) and a batch size as input. It returns two Keras ImageDataGenerator instances: one for training and one for validation.

    Args:
        root_dir: The base directory of your custom dataset (e.g., 'custom_data').
        batch_size: The number of images to process in each batch during training.

    Returns:
        train_data_generator: A Keras ImageDataGenerator for the training set.
        val_data_generator: A Keras ImageDataGenerator for the validation set.
    """

    train_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input,
        horizontal_flip=True,
        zoom_range=[0.8, 1.2],
        rotation_range=20
    ).flow_from_directory(
        os.path.join(root_dir, 'train'),
        target_size=(128, 128),
        batch_size=batch_size,
        class_mode='binary'
    )
    # 1. Training Data Generator:
    #   - ImageDataGenerator is used to load images from the 'train' directory.
    #   - preprocessing_function: Applies MobileNetV2's preprocessing function (scaling, normalization) to each image.
    #   - horizontal_flip=True: Randomly flips some images horizontally (data augmentation).
    #   - zoom_range=[0.8, 1.2]: Randomly zooms images in or out (data augmentation).
    #   - rotation_range=20: Randomly rotates images (data augmentation).
    #   - flow_from_directory:
    #       - Reads images from subfolders in the 'train' directory (each subfolder represents a class).
    #       - Automatically labels the images based on their subfolder.
    #       - Resizes all images to 128x128 pixels.
    #       - Creates batches of 'batch_size' images for efficient training.
    #       - Sets class_mode to 'binary' for cat vs. dog classification.


    val_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
    ).flow_from_directory(
        os.path.join(root_dir, 'validation'),
        target_size=(128, 128),
        batch_size=batch_size,
        class_mode='binary'
    )
    # 2. Validation Data Generator:
    #   - Similar to the training generator, but without data augmentation (to assess model performance on original images).
    #   - Loads images from the 'validation' directory.
    #   - Applies the same preprocessing as the training set.

    return train_data_generator, val_data_generator 
    # Returns both generators for use in training and validation.

In [70]:
with open('train.py', 'a') as f:  # Append to train.py
    f.write("""
def create_data_loaders(train_dir, val_dir, batch_size=32, image_size=(128, 128)):
    \"\"\"Creates training and validation data loaders.\"\"\"

    train_ds = tf.keras.utils.image_dataset_from_directory(
        train_dir,
        image_size=image_size,
        batch_size=batch_size,
        label_mode='binary'  # For binary classification
    )

    val_ds = tf.keras.utils.image_dataset_from_directory(
        val_dir,
        image_size=image_size,
        batch_size=batch_size,
        label_mode='binary'
    )

    return train_ds, val_ds

# Add preprocessing and augmentation here if needed
""")

# Training Script - Putting it Together

In [17]:

%%writefile -a train.py

if __name__ == '__main__':
    """Main function for training the model."""

    parser = argparse.ArgumentParser()  # Create an argument parser

    parser.add_argument('--epochs', type=int, default=3)            # Add argument for epochs (default: 3)
    parser.add_argument('--batch_size', type=int, default=16)       # Add argument for batch size (default: 16)
    parser.add_argument('--steps', type=int, default=int(5865/16))  # Add argument for steps per epoch (default: calculated)
    parser.add_argument('--val_steps', type=int, default=int(1484/16)) # Add argument for validation steps (default: calculated)

    parser.add_argument('--model_dir', type=str)                 # Add argument for local model directory
    parser.add_argument('--sm_model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))  # Add argument for SageMaker model directory
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))  # Add argument for training data directory

    args, _ = parser.parse_known_args()    # Parse arguments and ignore unknowns 

In [71]:
# Append to existing train.py
with open('train.py', 'a') as f:  # Append to train.py
    f.write("""

if __name__ == '__main__':
    \"\"\"Main function for training the model.\"\"\"

    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--model_dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--val', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))

    args = parser.parse_args()

    train_ds, val_ds = create_data_loaders(args.train, args.val, args.batch_size)

    # Train the model:
    model = build_model()
    
    # Callback to stop training when validation loss doesn't improve
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(train_ds,
        epochs=args.epochs,
        validation_data=val_ds,
        callbacks=[early_stopping]
    )
    
    # Save the model locally in SavedModel format for SageMaker:
    model.save(os.path.join(args.model_dir, 'model')) 
""")

# Upload Dataset to S3

In [18]:
sess = sagemaker.Session()               # Create a SageMaker session
role = sagemaker.get_execution_role()     # Get the IAM role for SageMaker
bucket_name = 'imagedatatf'              # Name of your S3 bucket

print('Uploading data to S3')
s3_data_path = sess.upload_data(path=root_dir, bucket=bucket_name, key_prefix='data')  # Corrected typo: get_execution_role
print('Uploaded to', s3_data_path)

Uploading data to S3
Uploaded to s3://imagedatatf/data


# Train with TensorFlow Estimator

In [72]:
from sagemaker.tensorflow import TensorFlow

# Replace 'your-iam-role-arn' with your actual IAM role ARN
role = sagemaker.get_execution_role()     

pets_estimator = TensorFlow(
    entry_point='train.py',      
    role=role,                     
    instance_type='ml.p3.2xlarge',   # Or choose another appropriate instance type
    instance_count=1,             
    framework_version='2.12',     
    py_version='py310',            # Specify Python version
    output_path='s3://imagedatatf'  # Bucket name directly included
)

In [None]:
# fit and train the model 

pets_estimator.fit(s3_data_path)

# Deploy TensorFlow Model

In [None]:
pets_predictor = pets_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
print('\nModel is deployed')

# Final Predictions

In [None]:
cat_dir = 'custom_data/validation/cat/'
cat_images = [os.path.join(cat_dir, x) for x in os.listdir(cat_dir)]
print(cat_images[0])

dog_dir = 'custom_data/validation/dog/'
dog_images = [os.path.join(dog_dir, x) for x in os.listdir(dog_dir)]
print(dog_images[0])

In [None]:
def get_pred(image_path):
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=(128, 128))
    img = tf.keras.preprocessing.image.img_to_array(img)
    img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
    img = np.expand_dims(img, axis=0)

    results = pets_predictor.predict(img)
    return results

In [None]:
image_path = cat_images[0]
results = get_pred(image_path)

print(results)

In [None]:
class_id = int(np.squeeze(results['predictions']) > 0.5)
print('Predicted class_id:', class_id, 'with class_name:', classes[class_id])

# Delete Model Endpoint

In [None]:
sagemaker_session.delete_endpoint(pets_predictor.endpoint)