# SageMaker Processing Jobs With Containers

The objective of this notebook is:
1) Run a processing job to run a scikit-learn script that cleans, pre-processes, performs feature engineering, and splits the input data into train and test sets.
2) Run a training job on the pre-processed training data to train a model
3) Run a processing job on the pre-processed test data to evaluate the trained model's performance
4) Use your own custom container to run processing jobs with your own Python libraries and dependencies.


Source:
https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb

#### Start Session

In [18]:
from sagemaker.session import Session
from sagemaker import get_execution_role
import sagemaker
import boto3

In [19]:
from sagemaker.sklearn.processing import SKLearnProcessor
import pandas as pd

In [20]:
AWS_ACCESS_KEY = 'abc'
AWS_SECRET = 'edfg'

region_name='us-east-2'

boto_session = boto3.session.Session(
   aws_access_key_id=AWS_ACCESS_KEY,
   aws_secret_access_key=AWS_SECRET,
   region_name=region_name
)

bucket = 'sm-amazon-nk'

sagemaker_session = Session(boto_session=boto_session)

model_package_group_name = f"AmazonModelPackageGroupName"

role = 'arn:aws:iam::013747046745:role/sagemaker-role-amazon'

#### Data Preprocessing and Feature Engineering

In [21]:
#instantiate sklearn processor
sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0", role=role, instance_type="ml.m5.xlarge", instance_count=1
)


In [23]:
input_data = 'https://sm-amazon-nk.s3.us-east-2.amazonaws.com/train_10.csv'

df = pd.read_csv(input_data, index_col=0)
df.head(n=10)

Unnamed: 0,label,text
0,1,Stuning even for the non-gamerThis sound track...
1,1,The best soundtrack ever to anything.I'm readi...
2,1,Amazing!This soundtrack is my favorite music o...
3,1,Excellent SoundtrackI truly like this soundtra...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."
5,1,an absolute masterpieceI am quite sure any of ...
6,0,"Buyer bewareThis is a self-published book, and..."
7,1,Glorious storyI loved Whisper of the wicked sa...
8,1,A FIVE STAR BOOKI just finished reading Whispe...
9,1,Whispers of the Wicked SaintsThis was a easy t...


In [26]:
%%writefile preprocessing-container.py

import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer

from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action="ignore", category=DataConversionWarning)
columns=['label','text']

if __name__ =='__main__':
    #instantiate parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-test-split-ratio', 
                        type=float, 
                        default=0.3)
    print('Received arguments {}'.format(args))
    
    #read from input file
    input_data_path = os.path.join("/opt/ml/processing/input", "train_10.csv")
    print("Reading input data from {}".format(input_data_path))
    df = pd.read_csv(input_data_path)
    df = pd.DataFrame(data=df, columns=columns)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    
    #check data ratio
    negative_examples, positive_examples = np.bincount(df["label"])
    print(
        "Data after cleaning: {}, {} positive examples, {} negative examples".format(
            df.shape, positive_examples, negative_examples
        )
    )
    
    #train test split
    split_ratio = args.train_test_split_ratio
    print("Splitting data into train and test sets with ratio {}".format(split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop("label", axis=1), df["label"], test_size=split_ratio, random_state=0
    )
    
    #intantiate preprocess
    tfidf = TfidfVectorizer(strip_accents='ascii', 
                        lowercase=True,
                        analyzer = 'word',
                        stop_words='english',
                        token_pattern = r'(?u)\b\w\w+\b',\
                        max_df=0.95,
                        min_df = 5
                       )
    tfidf.fit(X_train['text'])
    
    #run the preprocessing job
    print("Running preprocessing and feature engineering transformations")
    train_features = preprocess.fit_transform(X_train)
    test_features = preprocess.transform(X_test)
    
    print("Train data shape after preprocessing: {}".format(train_features.shape))
    print("Test data shape after preprocessing: {}".format(test_features.shape))

    train_features_output_path = os.path.join("/opt/ml/processing/train", "train_features.csv")
    train_labels_output_path = os.path.join("/opt/ml/processing/train", "train_labels.csv")

    test_features_output_path = os.path.join("/opt/ml/processing/test", "test_features.csv")
    test_labels_output_path = os.path.join("/opt/ml/processing/test", "test_labels.csv")

    print("Saving training features to {}".format(train_features_output_path))
    pd.DataFrame(train_features).to_csv(train_features_output_path, header=False, index=False)

    print("Saving test features to {}".format(test_features_output_path))
    pd.DataFrame(test_features).to_csv(test_features_output_path, header=False, index=False)

    print("Saving training labels to {}".format(train_labels_output_path))
    y_train.to_csv(train_labels_output_path, header=False, index=False)

    print("Saving test labels to {}".format(test_labels_output_path))
    y_test.to_csv(test_labels_output_path, header=False, index=False)
    
    

Overwriting preprocessing-container.py


In [27]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

sklearn_processor.run(
    code="preprocessing-container.py",
    inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
    ],
    arguments=["--train-test-split-ratio", "0.2"],
)

preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description["ProcessingOutputConfig"]
for output in output_config["Outputs"]:
    if output["OutputName"] == "train_data":
        preprocessed_training_data = output["S3Output"]["S3Uri"]
    if output["OutputName"] == "test_data":
        preprocessed_test_data = output["S3Output"]["S3Uri"]

FileNotFoundError: [Errno 2] No such file or directory: 'https://sm-amazon-nk.s3.us-east-2.amazonaws.com/train_10.csv'