# <B> Preprocessing </B>
* Container: codna_python3

## AutoReload

In [1]:
%load_ext autoreload
%autoreload 2

## 1. parameter store 셋팅

In [2]:
import boto3
from utils.ssm import parameter_store

In [3]:
strRegionName=boto3.Session().region_name
pm = parameter_store(strRegionName)
prefix = pm.get_params(key="PREFIX")

## 2.Processing-job for preprocessing

In [5]:
import os
import wget
import sagemaker
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor

In [6]:
os.path.join(os.getcwd(), "data")

'/home/ec2-user/SageMaker/nemo-on-sagemaker/data'

* params for processing job

In [7]:
local_mode = False

if local_mode: 
    instance_type = 'local'
    
    import os
    from sagemaker.local import LocalSession
    
    sagemaker_session.config = {'local': {'local_code': True}}
    data_path = os.path.join(os.getcwd(), "data")
    
else:
    instance_type = "ml.m5.xlarge"
    sagemaker_session = sagemaker.Session()
    data_path = pm.get_params(key=prefix + 'S3-DATA-PATH')
    
print (f"instance-type: {instance_type}")
print (f"image-uri: {pm.get_params(key=''.join([prefix, 'IMAGE-URI']))}")
print (f"role: {pm.get_params(key=prefix + 'SAGEMAKER-ROLE-ARN')}")
print (f"bucket: {pm.get_params(key=prefix + 'BUCKET')}")
print (f"dataset-path: {data_path}")
print (f"sagemaker_session: {sagemaker_session}")

instance-type: ml.m5.xlarge
image-uri: 419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/nemo-image:latest
role: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436
bucket: sm-nemo-bucket
dataset-path: s3://sm-nemo-bucket/data
sagemaker_session: <sagemaker.session.Session object at 0x7f0b14e279a0>


* Define processing job

In [24]:
dataset_processor = FrameworkProcessor(
    estimator_cls=PyTorch,
    framework_version=None,
    image_uri=pm.get_params(key=''.join([prefix, "IMAGE-URI"])),
    instance_type=instance_type,
    instance_count=1,
    role=pm.get_params(key=prefix + "SAGEMAKER-ROLE-ARN"),
    base_job_name="preprocessing", # bucket에 보이는 이름 (pipeline으로 묶으면 pipeline에서 정의한 이름으로 bucket에 보임)
    sagemaker_session=sagemaker_session
)

proc_prefix = "/opt/ml/processing"

output_path = os.path.join(
    "s3://{}".format(pm.get_params(key=prefix + "BUCKET")),
    "preprocessing",
    "data"
)

In [25]:
dataset_processor.run(
    #job_name="preprocessing", ## 이걸 넣어야 캐시가 작동함, 안그러면 프로세서의 base_job_name 이름뒤에 날짜 시간이 붙어서 캐시 동작 안함
    code='./preprocessing.py', #소스 디렉토리 안에서 파일 path
    source_dir="./an4_nemo_sagemaker/code/preprocessing/", #현재 파일에서 소스 디렉토리 상대경로 # add processing.py and requirements.txt here
    inputs=[
        ProcessingInput(
            input_name="input-data",
            source=data_path,
            destination=os.path.join(proc_prefix, "input")
        ),
    ],
    outputs=[       
        ProcessingOutput(
            output_name="output-data",
            source=os.path.join(proc_prefix, "output"),
            destination=output_path
        ),
    ],
    arguments=["--proc_prefix", proc_prefix, "--region", strRegionName , \
               "--train_mount_dir", "/opt/ml/input/data/training/", \
               "--test_mount_dir", "/opt/ml/input/data/testing/"],
)

INFO:sagemaker.processing:Uploaded ./an4_nemo_sagemaker/code/preprocessing/ to s3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-03-16-09-22-00-721/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-ap-northeast-2-419974056037/preprocessing-2023-03-16-09-22-00-721/source/runproc.sh
INFO:sagemaker:Creating processing-job with name preprocessing-2023-03-16-09-22-00-721


.............................................[34mReceived arguments Namespace(proc_prefix='/opt/ml/processing', region='ap-northeast-2', train_mount_dir='/opt/ml/input/data/training/', test_mount_dir='/opt/ml/input/data/testing/')[0m
[34mConverting .sph to .wav...[0m
[34mFinished conversion.[0m
[34m******[0m
[34m******[0m
[34mTraining manifest created.[0m
[34mTest manifest created.[0m
[34m***Done***[0m
[34mdata_dir ['entrypoint', 'code', 'an4'][0m
[34mself.output_dir ['an4'][0m



In [None]:
pm.put_params(key="".join([prefix, "PREP-DATA-PATH"]), value=output_path, overwrite=True)

In [14]:

from distutils.dir_util import copy_tree


In [18]:
pwd()

'/home/ec2-user/SageMaker/nemo-on-sagemaker'

In [26]:
os.path.dirname(os.path.realpath(__file__))

NameError: name '__file__' is not defined