In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import json
from sagemaker import Session

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# Original Test Code

In [2]:
region_name = 'us-east-1'
# Set the region using boto3
boto3.setup_default_session(region_name=region_name)

# Create a SageMaker session with the default boto3 session
session = Session(boto_session=boto3.Session())

# Get the execution role
role = get_execution_role()

Couldn't call 'get_role' to get Role ARN from role name sagemaker_notebook_jupyter_lab to get Role path.


In [3]:
# Specify a different region, e.g., us-east-1
# region = session.boto_region_name
rcf_image = sagemaker.image_uris.retrieve('randomcutforest', region=region_name)

In [4]:
# Prepare synthetic data to simulate traffic data
data = pd.DataFrame({
    'month': [1, 2, 3, 4, 5],
    'week': [1, 2, 3, 4, 5],
    'day': [1, 2, 3, 4, 5],
    'season': [1, 2, 3, 4, 1],
    'average_speed': [30, 45, 50, 60, 35],
    'car_density': [5, 10, 15, 20, 8]
})

In [5]:
# Scale average_speed and car_density
scaler = MinMaxScaler()
data[['average_speed', 'car_density']] = scaler.fit_transform(data[['average_speed', 'car_density']])

# Optionally add cyclic encoding for time-based features
data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
data['day_sin'] = np.sin(2 * np.pi * data['day'] / 7)
data['day_cos'] = np.cos(2 * np.pi * data['day'] / 7)

# Drop original columns and save preprocessed data to CSV
preprocessed_data = data.drop(columns=['month', 'week', 'day', 'season'])
preprocessed_data.to_csv('data.csv', header=False, index=False)

In [7]:
# Upload data to S3
bucket = session.default_bucket()
prefix = 'rcf-example'
s3 = boto3.client('s3')
s3.upload_file('data.csv', bucket, f'{prefix}/input/data.csv')
input_data = f's3://{bucket}/{prefix}/input/data.csv'

ClientError: An error occurred (AccessDenied) when calling the CreateBucket operation: User: arn:aws:sts::559050226112:assumed-role/sagemaker_notebook_jupyter_lab/SageMaker is not authorized to perform: s3:CreateBucket on resource: "arn:aws:s3:::sagemaker-us-east-1-559050226112" because no identity-based policy allows the s3:CreateBucket action

In [6]:
# Configure the RCF model
num_samples_per_tree = 256
num_trees = 100

# Create the SageMaker estimator for RCF
rcf = sagemaker.estimator.Estimator(
    image_uri=rcf_image,
    role=role,
    instance_count=1,
    instance_type='ml.t3.medium',  # Free Tier eligible
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=session
)

rcf.set_hyperparameters(num_samples_per_tree=num_samples_per_tree, num_trees=num_trees)

# Train the model
rcf.fit({'train': input_data})

# Deploy the model to a SageMaker endpoint
rcf_predictor = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.t3.medium'
)



NameError: name 'bucket' is not defined

In [None]:
# Prepare synthetic data for inference
inference_data = np.array([[0.5, 0.8, 0.3, 0.7]])  # Example normalized feature array
payload = json.dumps(inference_data.tolist())

# Make a prediction
response = rcf_predictor.predict(payload)
result = json.loads(response)
print("Anomaly scores:", result)

# Clean up the endpoint
rcf_predictor.delete_endpoint()

# Rewrote Functions

In [5]:
pwd

'/home/ec2-user/SageMaker'

In [9]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker import Session
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import json
import os

# Preprocessing Functions
def preprocess_traffic_data(data):
    """
    Preprocesses the traffic data by scaling and applying cyclic encoding for time-based features.

    Parameters:
    - data (pd.DataFrame): Raw traffic data with columns ['month', 'week', 'day', 'season', 'average_speed', 'car_density'].

    Returns:
    - pd.DataFrame: Preprocessed traffic data with cyclic encoding and scaled features.
    """
    # Scale average_speed and car_density
    scaler = MinMaxScaler()
    data[['average_speed', 'car_density']] = scaler.fit_transform(data[['average_speed', 'car_density']])

    # Apply cyclic encoding for time-based features
    data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
    data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
    data['day_sin'] = np.sin(2 * np.pi * data['day'] / 7)
    data['day_cos'] = np.cos(2 * np.pi * data['day'] / 7)

    # Drop original columns and return preprocessed data
    return data.drop(columns=['month', 'week', 'day', 'season'])

def save_preprocessed_data_to_local(data, filename='data.csv'):
    """
    Saves the preprocessed data to a local CSV file.

    Parameters:
    - data (pd.DataFrame): Preprocessed traffic data.
    - filename (str): The name of the local file to be saved (default is 'data.csv').
    """
    # Save the preprocessed data to CSV
    data.to_csv(filename, header=False, index=False)
    return filename

def upload_to_s3(local_file_path, bucket_name, s3_key):
    """
    Uploads a local file to S3.

    Parameters:
    - local_file_path (str): The local file path of the CSV to be uploaded.
    - bucket_name (str): The name of the S3 bucket.
    - s3_key (str): The S3 object key (path where the file will be stored).
    """
    s3_client = boto3.client('s3')
    s3_client.upload_file(local_file_path, bucket_name, s3_key)
    
    # Return the full S3 URI
    return f"s3://{bucket_name}/{s3_key}"

# SageMaker Training Function
def train_rcf_model(input_data, output_data_file_path='output_data', region_name='us-east-1'):
    """
    Trains the RCF model using the preprocessed data stored locally or in S3.

    Parameters:
    - input_data (str): Local file path or S3 URI of the preprocessed data.
    - region_name (str): The AWS region for the SageMaker model.

    Returns:
    - sagemaker.estimator.Estimator: The trained RCF model.
    """
    # Get the RCF image URI
    rcf_image = sagemaker.image_uris.retrieve('randomcutforest', region=region_name)
    
    # Configure the RCF model
    num_samples_per_tree = 256
    num_trees = 100

    # Get the execution role and session (assuming you're using SageMaker)
    role = get_execution_role()
    session = Session()

    # Create SageMaker estimator for RCF
    rcf = sagemaker.estimator.Estimator(
        image_uri=rcf_image,
        role=role,
        instance_count=1,
        instance_type='ml.t3.medium',
        output_path=f'{output_data_file_path}',  # Saving output locally
        sagemaker_session=session
    )

    rcf.set_hyperparameters(num_samples_per_tree=num_samples_per_tree, num_trees=num_trees)

    # Train the model
    rcf.fit({'train': input_data})
    
    return rcf

# Deployment & Prediction Function
def deploy_and_predict(rcf_model, inference_data):
    """
    Deploys the RCF model and makes a prediction.

    Parameters:
    - rcf_model (sagemaker.estimator.Estimator): The trained RCF model.
    - inference_data (np.array): The data to be used for inference.

    Returns:
    - dict: The predicted anomaly scores.
    """
    # Deploy the model
    rcf_predictor = rcf_model.deploy(
        initial_instance_count=1,
        instance_type='ml.t3.medium'
    )
    
    # Prepare the payload for inference
    payload = json.dumps(inference_data.tolist())

    # Make a prediction
    response = rcf_predictor.predict(payload)
    result = json.loads(response)
    print("Anomaly scores:", result)

    # Clean up the endpoint
    rcf_predictor.delete_endpoint()

    return result

# Main workflow
def main(local_file=False, bucket_name=None, s3_key=None):
    """
    Main workflow to process data, train the model, and make predictions.

    Parameters:
    - local_file (bool): If True, the file is used locally; if False, the file is uploaded to S3.
    - bucket_name (str): The name of the S3 bucket (used only if local_file is False).
    - s3_key (str): The S3 key for the uploaded file (used only if local_file is False).
    """
    # Sample data (replace with your actual preprocessed data)
    data = pd.DataFrame({
        'month': [1, 2, 3, 4, 5],
        'week': [1, 2, 3, 4, 5],
        'day': [1, 2, 3, 4, 5],
        'season': [1, 2, 3, 4, 1],
        'average_speed': [30, 45, 50, 60, 35],
        'car_density': [5, 10, 15, 20, 8]
    })

    # Preprocess the data
    preprocessed_data = preprocess_traffic_data(data)

    # Save or upload data
    if local_file:
        local_file_path = save_preprocessed_data_to_local(preprocessed_data)
        input_data = local_file_path
    else:
        local_file_path = save_preprocessed_data_to_local(preprocessed_data)  # Save locally first
        input_data = upload_to_s3(local_file_path, bucket_name, s3_key)

    # Train the model
    rcf_model = train_rcf_model(input_data)

    # Inference data (example, replace with your own features)
    inference_data = np.array([[0.5, 0.8, 0.3, 0.7]])  # Example normalized feature array
    
    # Make a prediction
    result = deploy_and_predict(rcf_model, inference_data)

if __name__ == "__main__":
    # Call main() with local_file=True to work locally or local_file=False for S3 workflow
    main(local_file=False, bucket_name="your-s3-bucket-name", s3_key="data/traffic_data.csv")

Couldn't call 'get_role' to get Role ARN from role name sagemaker_notebook_jupyter_lab to get Role path.


ValueError: Cannot format input    average_speed  car_density  month_sin     month_cos   day_sin   day_cos
0       0.000000     0.000000   0.500000  8.660254e-01  0.781831  0.623490
1       0.500000     0.333333   0.866025  5.000000e-01  0.974928 -0.222521
2       0.666667     0.666667   1.000000  6.123234e-17  0.433884 -0.900969
3       1.000000     1.000000   0.866025 -5.000000e-01 -0.433884 -0.900969
4       0.166667     0.200000   0.500000 -8.660254e-01 -0.974928 -0.222521. Expecting one of str, TrainingInput, file_input or FileSystemInput