# **Preprocessing**
* Container: conda_pytorch_p310

## AutoReload

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 1. Processing-job for preprocessing

In [3]:
import os
import boto3
import sagemaker
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor




## 2. parameter store 설정

In [4]:
from utils.ssm import parameter_store

In [5]:
# Region configured for the default boto3 session; the parameter-store
# helper is created against that same region.
boto_session = boto3.Session()
region = boto_session.region_name
pm = parameter_store(region)

# Every project-specific key read below is namespaced as "<prefix>-<NAME>".
prefix = pm.get_params(key="PREFIX")

* Params for processing job

In [6]:
# Switch between a local-mode run (LocalSession, data read from ./data)
# and a managed SageMaker Processing run (data read from S3).
local_mode = False

if local_mode:
    # Only needed for local mode, so it is imported lazily here.
    from sagemaker.local import LocalSession

    instance_type = 'local'
    sagemaker_session = LocalSession()
    data_path = os.path.join(os.getcwd(), "data")
else:
    instance_type = "ml.m5.xlarge"  # alternative: "ml.g4dn.xlarge"
    sagemaker_session = sagemaker.Session()
    data_path = pm.get_params(key="-".join([prefix, "DATA-PATH-S3"]))

# Git settings handed to the processor so it can clone the code repository.
# Username and password are fetched with enc=True from the parameter store.
git_config = {
    'repo': f'https://{pm.get_params(key="-".join([prefix, "CODE_REPO"]))}',
    'branch': 'main',
    'username': pm.get_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), enc=True),
    'password': pm.get_params(key="-".join([prefix, "CODECOMMIT-PWD"]), enc=True)
}

role = pm.get_params(key="-".join([prefix, "SAGEMAKER-ROLE-ARN"]))
bucket_name = pm.get_params(key="-".join([prefix, "BUCKET-NAME"]))

# Never echo credentials into the notebook output: saved outputs are shared
# and committed together with the notebook. Print a masked copy instead;
# `git_config` itself is left untouched for the processing job.
_secret_fields = ('username', 'password')
git_config_redacted = {
    field: ('***' if field in _secret_fields else value)
    for field, value in git_config.items()
}

print (f'instance-type: {instance_type}')
print (f'role: {role}')
print (f'bucket: {bucket_name}')
print (f'dataset-path: {data_path}')
print (f'sagemaker_session: {sagemaker_session}')
print (f'git_config: {git_config_redacted}')

instance-type: ml.m5.xlarge
role: AmazonSageMaker-ExecutionRole-20221004T162466
bucket: sm-bert-ramp
dataset-path: s3://sm-bert-ramp/ramp-mlops/data/amazon_polarity.csv
sagemaker_session: <sagemaker.session.Session object at 0x7f6fd50979a0>
git_config: {'repo': 'https://git-codecommit.ap-northeast-2.amazonaws.com/v1/repos/bert-code', 'branch': 'main', 'username': '***', 'password': '***'}


* Define processing job

In [7]:
# Root directory used for the job's input/output mounts inside the container.
proc_prefix = "/opt/ml/processing"

# Destination for the processed dataset, joined as
# s3://<bucket_name>/<prefix>/preprocessing/data.
output_path = os.path.join(f"s3://{bucket_name}", prefix, "preprocessing", "data")

# PyTorch-based framework processor that runs the preprocessing script.
dataset_processor = FrameworkProcessor(
    estimator_cls=PyTorch,
    framework_version="2.0.0",
    py_version='py310',
    role=role,
    instance_type=instance_type,
    instance_count=1,
    # Name shown in the bucket (when the step is wrapped in a pipeline, the
    # name defined by the pipeline is shown in the bucket instead).
    base_job_name="preprocessing",
    sagemaker_session=sagemaker_session,
)

In [8]:
# Container-side mount points for the job's input and output channels.
container_input_dir = os.path.join(proc_prefix, "input")
container_output_dir = os.path.join(proc_prefix, "output")

# Raw dataset is copied from `data_path` into the container's input directory.
processing_inputs = [
    ProcessingInput(
        input_name="input-data",
        source=data_path,
        destination=container_input_dir,
    ),
]

# Whatever the script writes to the output directory is uploaded to `output_path`.
processing_outputs = [
    ProcessingOutput(
        output_name="output-data",
        source=container_output_dir,
        destination=output_path,
    ),
]

dataset_processor.run(
    # job_name="preprocessing",  # Required for caching to work; otherwise a date/time suffix is appended to the processor's base_job_name and the cache is not hit.
    code='preprocessing.py',  # Path of the entry script inside the source directory.
    # Relative path to the source directory; processing.py and requirements.txt go here.
    # NOTE(review): with git_config set, this is presumably resolved inside the cloned repo — confirm.
    source_dir="./code",
    git_config=git_config,
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=["--proc_prefix", proc_prefix, "--split_rate", "0.8"],
)

Cloning into '/tmp/tmp44p0e45c'...
remote: Counting objects: 9, done.        
Already on 'main'


Your branch is up to date with 'origin/main'.
Using provided s3_resource


INFO:sagemaker:Creating processing-job with name preprocessing-2023-08-07-05-35-48-202


[34mCollecting accelerate>=0.20.3 (from -r requirements.txt (line 1))
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 244.2/244.2 kB 11.9 MB/s eta 0:00:00[0m
[34mCollecting transformers==4.31.0 (from -r requirements.txt (line 2))
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/7.4 MB 40.1 MB/s eta 0:00:00[0m
[34mCollecting datasets[s3]==1.18.4 (from -r requirements.txt (line 3))
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 312.1/312.1 kB 45.5 MB/s eta 0:00:00[0m
[34mCollecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0->-r requirements.txt (line 2))
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 43.2 MB/s eta 0:00:00[0m
[34mCollecting regex!=2019.12.17 (from transformers==4.31.0->-r requirements.txt (line 2))
  Downloading

In [9]:
!aws s3 sync $output_path ./data/preprocessing --quiet
output_path

's3://sm-bert-ramp/ramp-mlops/preprocessing/data'

## 3. parameter store에 Processing output 추가

In [10]:
# Record the S3 location of the preprocessed data in the parameter store,
# overwriting any previous value stored under the same key.
prep_data_key = "-".join([prefix, "PREP-DATA-PATH"])
pm.put_params(key=prep_data_key, value=output_path, overwrite=True)

'Store suceess'