In [23]:
import sagemaker
from sagemaker.estimator import Estimator
import boto3
import time
from time import gmtime, strftime

# #1 - Write model script

In [16]:
%%writefile train.py
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define a simple neural network model
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)  # Flatten the input tensor to a vector of size 784
        x = torch.relu(self.fc1(x))  # Apply first layer with ReLU activation
        x = self.fc2(x)  # Apply second layer (outputs logits for 10 classes)
        return x

# Training function
def train():
    # Download and prepare the MNIST dataset
    transform = transforms.Compose([transforms.ToTensor()])
    train_dataset = datasets.MNIST(root='/opt/ml/input/data/train', train=True, transform=transform, download=True)
    train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
    
    model = SimpleNN()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(10):
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1} complete')

    # Save the trained model
    torch.save(model.state_dict(), '/opt/ml/model/model.pth')

if __name__ == '__main__':
    train()

Overwriting train.py


# #2 - Write Dockerfile

In [17]:
%%writefile Dockerfile
FROM python:3.8

# Install dependencies
RUN pip install torch torchvision

# Set the working directory
WORKDIR /app

# Copy the training script into the container
COPY train.py .

# Command to run when the container starts
ENTRYPOINT ["python", "train.py"]

Overwriting Dockerfile


# #3 - Create and Register Docker Image

In [18]:
# Enable ECR Access
!$(aws ecr get-login --no-include-email --region us-east-2)

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [12]:
# Create ECR Repository
!aws ecr create-repository --repository-name my-custom-algorithm


An error occurred (RepositoryAlreadyExistsException) when calling the CreateRepository operation: The repository with name 'my-custom-algorithm' already exists in the registry with id '038462750455'


In [19]:
# Install Docker
!sudo yum update -y
!sudo amazon-linux-extras install docker -y
!sudo service docker start
!sudo usermod -a -G docker ec2-user

Loaded plugins: dkms-build-requires, extras_suggestions, kernel-livepatch,
              : langpacks, priorities, update-motd, versionlock
amzn2-core                                               | 3.6 kB     00:00     
https://download.docker.com/linux/centos/2/x86_64/stable/repodata/repomd.xml: [Errno 14] HTTPS Error 404 - Not Found
Trying other mirror.
63 packages excluded due to repository priority protections
No packages marked for update
Installing docker
Loaded plugins: dkms-build-requires, extras_suggestions, kernel-livepatch,
              : langpacks, priorities, update-motd, versionlock
Cleaning repos: amzn2-core amzn2extra-docker amzn2extra-kernel-5.10
              : amzn2extra-livepatch amzn2extra-lustre amzn2extra-python3.8
              : centos-extras
              : copr:copr.fedorainfracloud.org:vbatts:shadow-utils-newxidmap
              : docker-ce-stable nvidia-container-toolkit
34 metadata files removed
17 sqlite files removed
0 metadata files removed
Loaded plug

In [20]:
# Get the AWS account ID and region
sts = boto3.client("sts")
account_id = sts.get_caller_identity()["Account"]
region = boto3.Session().region_name

# Define the repository name and ECR URI
repository_name = "my-custom-algorithm"
ecr_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{repository_name}"

# Docker commands to build, tag, and push the Docker image
!docker build -t my-custom-algorithm .
!docker tag my-custom-algorithm:latest {ecr_uri}:latest
!docker push {ecr_uri}:latest


[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 296B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.8              0.2s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.2s (2/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 296B                                       0.0s
[0m[34m => [internal] load metadata for docker.io/library/python:3.8              0.2s
[0m[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (9/9) FINISHED                                 docker:default
[34m => [internal] load build definition from Docker

# #4 - Train Custom Model

In [24]:
# Specify variables
bucket_name = "training-models-on-amazon-sagemaker"
prefix = 'mnist/simpleNN'
bucket_path = f"s3://{bucket_name}/{prefix}"

# Get AWS account and region dynamically
account_id = boto3.client("sts").get_caller_identity()["Account"]
region = boto3.Session().region_name

# ECR repository details (the updated custom container)
repository_name = "my-custom-algorithm"

role = sagemaker.get_execution_role()

# Define the custom Estimator using the updated ECR image
custom_estimator = Estimator(
    image_uri=f"{account_id}.dkr.ecr.{region}.amazonaws.com/{repository_name}:latest",
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=bucket_path,  # Output path for the trained model
)

# Start the training job
training_job_name = f'simpleNN-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'
custom_estimator.fit(job_name=training_job_name)

# Fetch training job details
sm_client = boto3.client('sagemaker')

# Retrieve the training job description
training_info = sm_client.describe_training_job(TrainingJobName=training_job_name)

# Print the S3 location where the model is stored
model_s3_uri = training_info['ModelArtifacts']['S3ModelArtifacts']
print(f"Model saved at: {model_s3_uri}")

INFO:sagemaker:Creating training-job with name: simpleNN-2024-10-22-02-51-32


2024-10-22 02:51:35 Starting - Starting the training job...
2024-10-22 02:51:48 Starting - Preparing the instances for training...
2024-10-22 02:52:33 Downloading - Downloading the training image..................
2024-10-22 02:55:15 Training - Training image download completed. Training in progress.[34m#0150.3%#0150.7%#0151.0%#0151.3%#0151.7%#0152.0%#0152.3%#0152.6%#0153.0%#0153.3%#0153.6%#0154.0%#0154.3%#0154.6%#0155.0%#0155.3%#0155.6%#0156.0%#0156.3%#0156.6%#0156.9%#0157.3%#0157.6%#0157.9%#0158.3%#0158.6%#0158.9%#0159.3%#0159.6%#0159.9%#01510.2%#01510.6%#01510.9%#01511.2%#01511.6%#01511.9%#01512.2%#01512.6%#01512.9%#01513.2%#01513.6%#01513.9%#01514.2%#01514.5%#01514.9%#01515.2%#01515.5%#01515.9%#01516.2%#01516.5%#01516.9%#01517.2%#01517.5%#01517.9%#01518.2%#01518.5%#01518.8%#01519.2%#01519.5%#01519.8%#01520.2%#01520.5%#01520.8%#01521.2%#01521.5%#01521.8%#01522.1%#01522.5%#01522.8%#01523.1%#01523.5%#01523.8%#01524.1%#01524.5%#01524.8%#01525.1%#01525.5%#01525.8%#01526.1%#01526.4%#015