# Prepare data with SageMaker Processing

## Setup environment

In [None]:
import sagemaker
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

# Session wrapper used for the S3 upload and the default-bucket lookup.
sagemaker_session = sagemaker.Session()

# Execution role of the notebook instance; it is handed to the processing job.
role = sagemaker.get_execution_role()

# Default bucket of the session; assign a bucket name here to use your own.
bucket = sagemaker_session.default_bucket()

## Get data

In [None]:
# Download the sample reviews.csv dataset (uploaded to S3 in the next cell).
!wget https://aws-mlops-workshop.s3-eu-west-1.amazonaws.com/reviews/workshop_data/reviews.csv

In [None]:
# Key prefix under which the raw dataset is stored in the bucket.
prefix = 'data/input'

# Upload the local CSV and keep the value returned by upload_data;
# it is used later as the source of the processing job input.
s3_input = sagemaker_session.upload_data(
    path='reviews.csv',
    bucket=bucket,
    key_prefix=prefix,
)
print(s3_input)

## Build and push container

In [None]:
# Name passed to the container build script in the next cell.
image_name = "data-processing-containers"

In [None]:
# Run the repository's build_and_push.sh script with the image name as its only argument.
# NOTE(review): presumably builds the Docker image and pushes it to ECR - confirm against the script.
!sh ./docker/build_and_push.sh $image_name

In [None]:
# Placeholder: replace with the URI of your own ECR image before running
# the cells below; it is used as the image_uri of the processing job.
container = "<your-container-image-uri>"

## Launch data processing job

In [None]:
# Configure the processing job: a single ml.m5.xlarge instance with a 30 GB
# volume, running the custom container, capped at 1200 seconds of runtime.
data_processor = Processor(
    role=role,
    image_uri=container,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size_in_gb=30,
    max_runtime_in_seconds=1200,
    base_job_name='data-processing',
)

In [None]:
# Paths used for the job's input and output; they are also forwarded to the
# container as the --input / --output command-line arguments.
input_folder = '/opt/ml/processing/input'
output_folder = '/opt/ml/processing/output'

# ProcessingOutput.destination expects a full S3 URI. The bare bucket name
# used previously is not one, so the results did not land at a location
# chosen here. Build an explicit URI that mirrors the 'data/input' prefix.
s3_output = f's3://{bucket}/data/output'

data_processor.run(
    arguments=[
        f'--input={input_folder}',
        f'--output={output_folder}',
    ],
    inputs=[
        # The dataset uploaded earlier (s3_input) is the source for input_folder.
        ProcessingInput(
            input_name='input',
            source=s3_input,
            destination=input_folder,
        ),
    ],
    outputs=[
        # Whatever the job writes to output_folder is uploaded to s3_output.
        ProcessingOutput(
            output_name='preprocessed',
            source=output_folder,
            destination=s3_output,
        ),
    ],
)