# Model Training Notebook
- In this notebook, we will train an image classifier on the vehicle data we extracted in the previous notebook
- We will use SageMaker's built-in image classifier

## Import Packages

In [1]:
from sagemaker import image_uris
import boto3
import pandas as pd
import sagemaker
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

from sagemaker import get_execution_role
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker.predictor import Predictor
from sagemaker.serializers import IdentitySerializer
import base64
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker import image_uris, Model



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Transform data to correct format for training
We need a .lst file (a tab-separated metadata file) that tells SageMaker which image to use, where it is located, and which label it corresponds to. The image classification algorithm expects data in this format.

In [36]:
# Custom label mapping: original dataset class id -> contiguous class index
# (0..5) used as the training label.
vehicle_label_map = {
    8: 0,
    13: 1,
    48: 2,
    58: 3,
    85: 4,
    89: 5
}

def to_metadata_file(df, prefix):
    """Write a tab-separated ``<prefix>.lst`` metadata file for SageMaker.

    Each output line is ``row<TAB>label<TAB>path`` with no header and no
    index column.

    Args:
        df: DataFrame with the columns ``row``, ``labels`` and ``filenames``.
            It is not modified; the output is built in a separate frame, so
            callers do not have to pass a copy (passing one is still fine).
        prefix: Output path without extension; ``.lst`` is appended.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for a file path (``None``).

    Note:
        Labels missing from ``vehicle_label_map`` are written as ``-1``.
    """
    lst_frame = pd.DataFrame(
        {
            "row": df["row"],
            # Translate dataset class ids to training labels; unknown ids -> -1.
            "labels": df["labels"].apply(
                lambda label: vehicle_label_map.get(label, -1)
            ),
            # The image path column is taken straight from `filenames`.
            "s3_path": df["filenames"],
        }
    )

    # Save as .lst file for SageMaker
    return lst_frame.to_csv(
        f"{prefix}.lst", sep="\t", index=False, header=False
    )


In [37]:
# Load the train/test splits from CSV.
df_train = pd.read_csv("../Data/df_train.csv")
df_test = pd.read_csv("../Data/df_test.csv")

# Write one .lst metadata file per split. A copy is passed so the frames
# loaded above are never touched by the helper.
for split_frame, lst_prefix in (
    (df_train, "../Data/train"),
    (df_test, "../Data/test"),
):
    to_metadata_file(split_frame.copy(), lst_prefix)

In [38]:
# S3 bucket and AWS region used by the cells below.
region = "us-east-1"
bucket = "sagemaker-us-east-1-351669278598"

In [39]:
# Upload the train/test .lst metadata files to the bucket.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)

# (destination object key, local file) pairs
lst_uploads = (
    ("vehicle_data/train.lst", "../Data/train.lst"),
    ("vehicle_data/test.lst", "../Data/test.lst"),
)
for object_key, local_path in lst_uploads:
    s3_bucket.Object(object_key).upload_file(local_path)

## Configure Training parameters

In [40]:
# Model artifacts produced by training are saved under this S3 prefix.
s3_output_location = f"s3://{bucket}/models/image_model"

# Look up the container image URI of the 'image-classification' algorithm
# for this region, asking for the 'latest' version.
algo_image = image_uris.retrieve(
    framework="image-classification",
    region=region,
    version="latest",
)

In [41]:
# IAM role the training job runs under.
role = get_execution_role()

# Estimator for the image classification container: one ml.p3.2xlarge
# instance, volume_size=5, artifacts written to s3_output_location.
training_session = sagemaker.Session()
img_classifier_model = sagemaker.estimator.Estimator(
    image_uri=algo_image,
    role=role,
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=training_session,
)

In [42]:
# Hyperparameters handed to the algorithm container.
hyperparameters = {
    "image_shape": "3,32,32",
    "num_classes": 6,  # matches the six entries of the label mapping
    "num_training_samples": len(df_train),
}
img_classifier_model.set_hyperparameters(**hyperparameters)

In [43]:
# S3 location for each input channel: the image folders plus the matching
# .lst metadata files.
channel_uris = {
    "train": f"s3://{bucket}/vehicle_data/train/",
    "validation": f"s3://{bucket}/vehicle_data/test/",
    "train_lst": f"s3://{bucket}/vehicle_data/train.lst",
    "validation_lst": f"s3://{bucket}/vehicle_data/test.lst",
}

# Every channel is declared with the same content type.
model_inputs = {
    channel: sagemaker.inputs.TrainingInput(
        s3_data=s3_uri,
        content_type="application/x-image",
    )
    for channel, s3_uri in channel_uris.items()
}

In [44]:
# Launch the training job on the four input channels defined above.
img_classifier_model.fit(inputs=model_inputs)

2025-04-07 15:46:14 Starting - Starting the training job
...........15:46:14 Pending - Training job waiting for capacity.
..25-04-07 15:47:53 Pending - Preparing the instances for training.
..25-04-07 15:48:24 Downloading - Downloading input data.
..............48:55 Downloading - Downloading the training image.
[34mDocker entrypoint called with argument(s): train[0mmpleted. Training in progress..
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mMon Apr  7 15:51:51 2025       [0m
[34m+-----------------------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |[0m
[34m|-----------------------------------------+------------------------+----------------------+[0m
[34m| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |

## Deploy model (can redeploy the model without retraining)
- Don't forget to delete the endpoint when it is not in use
- If redeploying, edit the endpoint address in the Lambda classify function

In [2]:
# Deployment settings, defined again here so this section can be run to
# redeploy a previously trained model without retraining.
bucket = 'sagemaker-us-east-1-351669278598'
role = get_execution_role()
region = 'us-east-1'

# find model tar file in s3 bucket of previously trained model
model_path = 's3://sagemaker-us-east-1-351669278598/models/image_model/image-classification-2025-04-07-15-46-13-797/output/model.tar.gz'

# Image URI for SageMaker's image classification algorithm
algo_image = image_uris.retrieve(
    region=region,
    framework='image-classification',
    version='latest'
)

# Create the Model object.
# predictor_cls is set so that deploy() returns a usable Predictor; without
# it, Model.deploy() returns None and `predictor` cannot be used.
# NOTE: this rebinds `img_classifier_model`, which was the training Estimator
# in the cells above, to a Model.
img_classifier_model = Model(
    image_uri=algo_image,
    model_data=model_path,
    role=role,
    predictor_cls=Predictor
)

In [3]:
# S3 prefix that captured endpoint data is written to.
capture_uri = f"s3://{bucket}/data-capture"

# Capture configuration: enabled, sampling 100 percent, stored at capture_uri.
data_capture_config = DataCaptureConfig(
    destination_s3_uri=capture_uri,
    sampling_percentage=100,
    enable_capture=True,
)

In [4]:
# Display the S3 URI configured as the data-capture destination.
capture_uri

's3://sagemaker-us-east-1-351669278598/data-capture'

In [5]:
# Deploy the model
predictor = img_classifier_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    endpoint_name='vehicle-detector-endpoint-10-04-25', # ensure to give unique name
    data_capture_config=data_capture_config,
    wait=True  # waits for deployment to finish
)

--------!

## Test inference through endpoint

In [None]:
# Name of the endpoint to query. This is the name used in the deploy cell;
# change it here if the model is redeployed under a different name.
endpoint_name = 'vehicle-detector-endpoint-10-04-25'

In [None]:
from sagemaker.predictor import Predictor

# Attach to the already-deployed endpoint by name. The variable defined in
# the previous cell is `endpoint_name`; the earlier `endpoint` was undefined
# and raised a NameError.
predictor = Predictor(endpoint_name=endpoint_name)

In [122]:
# Send the PNG bytes unchanged, with content type image/png.
predictor.serializer = IdentitySerializer("image/png")

# Read one test image as raw bytes.
with open("../Data/test/bike_s_000694.png", "rb") as image_file:
    payload = image_file.read()

# Invoke the endpoint with the image bytes.
inference = predictor.predict(payload)

In [123]:
# Decode the raw response bytes to text. The type check makes the cell safe
# to re-run: once `inference` is a str, calling .decode() on it would raise
# AttributeError.
if isinstance(inference, (bytes, bytearray)):
    inference = inference.decode("utf-8")

In [124]:
# Show the response text decoded in the previous cell.
print(inference)

'[0.9389722347259521, 0.0002591302036307752, 0.0033516166731715202, 5.067536676506279e-06, 0.05736708268523216, 4.493407323025167e-05]'

In [125]:
import json

# The decoded response is a JSON array in text form; parse it into a Python
# list. A value that is not a string is kept unchanged.
raw = inference
inference = json.loads(raw) if isinstance(raw, str) else raw

In [126]:
# Position of the highest score; on ties the first (lowest) index wins.
vehicle_index = inference.index(max(inference))

# Class index -> vehicle name. The order must follow vehicle_label_map used
# when writing the .lst files (8->0, 13->1, 48->2, 58->3, 85->4, 89->5).
# Assuming those source ids are CIFAR-100 fine labels, 85 is "tank" and 89 is
# "tractor", so index 4 is tank and index 5 is tractor (they were swapped).
# TODO confirm against the label names produced by the data-extraction notebook.
index_to_vehicle = {
    0: "bicycle",
    1: "bus",
    2: "motorcycle",
    3: "pickup_truck",
    4: "tank",
    5: "tractor",
}
vehicle = index_to_vehicle[vehicle_index]

In [127]:
# Show the predicted vehicle name and its score (the closing parenthesis was
# missing, which made this cell a SyntaxError).
print(vehicle, max(inference))

'bicycle'