In [7]:
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch

# --- AWS sessions, clients and execution role -------------------------------
boto_session = boto3.Session()
region = boto_session.region_name

sm_session = sagemaker.Session()
sm_client = boto_session.client("sagemaker")
sm_role = sagemaker.get_execution_role()

s3_client = boto3.client("s3")

# --- S3 locations -----------------------------------------------------------
bucket = "wildfires"
prefix = "models"
train_data_path = "s3://wildfires/data/train/"

# --- MLflow -----------------------------------------------------------------
tracking_server_arn = "arn:aws:sagemaker:eu-central-1:567821811420:mlflow-tracking-server/wildfire-mj"
experiment_name = "wildfire-classification"

In [8]:
%%writefile requirements.txt
mlflow==2.13.2
torchinfo
sagemaker-mlflow==0.1.0

Overwriting requirements.txt


In [9]:
# Name passed to the training script through the 'run-name' hyperparameter.
run_name = 'train-resnet-fire'

# Single-instance PyTorch training job that runs `training.py` on a GPU
# instance and writes its output under s3://{bucket}/{prefix}/.
new_estimator = PyTorch(
    entry_point='training.py',
    role=sm_role,
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    input_mode='File',
    py_version="py39",
    framework_version="1.13",
    environment={
        # MLflow reads the tracking server location from MLFLOW_TRACKING_URI
        # (singular). The previous key had a trailing "s", which MLflow does
        # not recognise, so the ARN was never picked up from the environment.
        # NOTE(review): confirm training.py does not read the old misspelled
        # key explicitly.
        'MLFLOW_TRACKING_URI': tracking_server_arn,
        'MLFLOW_EXPERIMENT_NAME': experiment_name,
    },
    # Shipped alongside the entry point for the training job.
    dependencies=['requirements.txt'],
    # Forwarded to the training script; how each one is consumed depends on
    # training.py, which is not shown in this notebook.
    hyperparameters={
        'num-epochs': 3,
        'batch-size': 32,
        'learning-rate': 0.1,
        'run-name': run_name,
        'bucket': bucket,
        'region': region,
        'seed': 42
    },
    output_path=f's3://{bucket}/{prefix}/'
)

# Launch the training job; the 'train' channel points at the training data.
new_estimator.fit({'train': train_data_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-06-27-17-21-55-552


2024-06-27 17:21:55 Starting - Starting the training job...
2024-06-27 17:22:05 Pending - Training job waiting for capacity...
2024-06-27 17:22:29 Pending - Preparing the instances for training...
2024-06-27 17:23:10 Downloading - Downloading input data......
2024-06-27 17:24:10 Downloading - Downloading the training image.....................
2024-06-27 17:27:31 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-06-27 17:27:44,680 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-06-27 17:27:44,700 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-06-27 17:27:44,713 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-06-27 17:27:44,715 sagemaker_pytorch_container.training INF