In [15]:
import sagemaker
from sagemaker.pytorch import PyTorch
import boto3
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial

In [16]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name)

In [17]:
def create_trial(experiment_name):
    from time import strftime
    create_date = strftime("%m%d-%H%M%s")
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [18]:
s3_client = boto3.client("s3")
sess = sagemaker.session.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
key_prefix = "gscaltex-data"

In [5]:
s3_data_location = f's3://{bucket}/{key_prefix}'
s3_data_location

's3://sagemaker-us-west-2-322537213286/gscaltex-data'

In [6]:
!aws s3 cp ./data/tmp_df.csv $s3_data_location/tmp_df.csv

upload: data/tmp_df.csv to s3://sagemaker-us-west-2-322537213286/gscaltex-data/tmp_df.csv


In [7]:
subnets=None
security_group_ids=None

In [20]:
experiment_name = 'caltex-poc-1'

training_instance_type='local'
# training_instance_type='ml.m5.xlarge'
# training_instance_type='ml.p3.2xlarge'

instance_count = 1
use_spot_instances = False
max_wait = None
max_run = 1*60*60

In [21]:
from pathlib import Path
if training_instance_type in ['local_gpu', 'local']:
    from sagemaker.local import LocalSession
    

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    input_path = f'file://{Path.cwd()}/data'
else:
    sagemaker_session = sagemaker.Session()
    input_path = s3_data_location

source_dir = f'{Path.cwd()}/1.train_code_caltex'

In [24]:
# Pytorch Image is used to enable distributed GPU training
estimator = PyTorch(
    source_dir=source_dir,
    entry_point="main_parallel.py",
    # entry_point="main_parallel.py",
    subnets=subnets,
    security_group_ids=security_group_ids,
    role=role,
    sagemaker_session=sagemaker_session,
    instance_count=instance_count, 
    instance_type=training_instance_type,
    framework_version="1.12.1",
    volume_size=512,
    py_version="py38",
    disable_profiler=True,
    use_spot_instances=use_spot_instances,
    max_wait=max_wait,
    max_run=max_run,
)

In [25]:
create_experiment(experiment_name)
job_name = create_trial(experiment_name)

estimator.fit(
    inputs={"train": input_path},
    job_name=job_name,
    experiment_config={
      'TrialName': job_name,
      'TrialComponentDisplayName': job_name,
    },
    wait=False,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: caltex-poc-1-0409-07481681026501
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-ibe70:
    command: train
    container_name: 31nbuxxicy-algo-1-ibe70
    deploy:
      resources:
        reservations:
          devices:
          - capabilities:
            - gpu
    environment:
    - '[Masked]'
  

Creating 31nbuxxicy-algo-1-ibe70 ... 
Creating 31nbuxxicy-algo-1-ibe70 ... done
Attaching to 31nbuxxicy-algo-1-ibe70
[36m31nbuxxicy-algo-1-ibe70 |[0m 2023-04-09 07:48:24,342 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36m31nbuxxicy-algo-1-ibe70 |[0m 2023-04-09 07:48:24,408 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
[36m31nbuxxicy-algo-1-ibe70 |[0m 2023-04-09 07:48:24,417 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36m31nbuxxicy-algo-1-ibe70 |[0m 2023-04-09 07:48:24,420 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36m31nbuxxicy-algo-1-ibe70 |[0m 2023-04-09 07:48:24,423 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36m31nbuxxicy-algo-1-ibe70 |[0m 2023-04-09 07:48:24,425 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
[36m31nbux

INFO:root:creating /tmp/tmpw6mpj0vu/artifacts/output/data
INFO:root:copying /tmp/tmpw6mpj0vu/algo-1-ibe70/output/success -> /tmp/tmpw6mpj0vu/artifacts/output
INFO:root:copying /tmp/tmpw6mpj0vu/model/save_model.pkl -> /tmp/tmpw6mpj0vu/artifacts/model


[36m31nbuxxicy-algo-1-ibe70 exited with code 0
[0mAborting on container exit...
===== Job Complete =====


In [None]:
estimator_gpu.logs()