In [73]:
# Dataset modeling prerequisites
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

# S3 locations: bucket, object key of the raw dataset, and the derived dataset URI.
BUCKET_URI = "s3://dandadan"
BUCKET_NAME = "dandadan"
DATASET_KEY = "mobile_features_ds.csv"
DATASET_PATH = f"{BUCKET_URI}/{DATASET_KEY}"

# S3 client (used for the download) and resource (used for the uploads below).
s3 = boto3.client("s3")
s3_resource = boto3.resource("s3")

# Load the dataset directly from S3.
obj = s3.get_object(Bucket=BUCKET_NAME, Key=DATASET_KEY)
df = pd.read_csv(obj['Body'])

# Label-encode the selected columns. One encoder per column is kept so the exact
# value -> code mapping can be persisted and replayed on inference inputs; a single
# shared encoder would only remember the last column it was fitted on.
# NOTE(review): the encoders are fitted on the full dataset (before the split), and
# these columns are also listed in `features_to_scale` - confirm that label-encoding
# them is really intended.
categorical_columns = ['clock_speed', 'm_dep']  # Assuming these columns need encoding
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define target and features
TARGET_NAME = "price_range"
features_to_scale = ['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep',
                     'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram',
                     'sc_h', 'sc_w', 'talk_time']

# Split dataset into features and target
X = df.drop(columns=[TARGET_NAME])
y = df[TARGET_NAME]

# Split the data into training and testing sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below never write into a
# view of `X` (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Save the scaler for future use (during inference)
scaler_filename = "scaler.pkl"
joblib.dump(scaler, scaler_filename)

# Upload the scaler to S3
scaler_s3_path = f"{BUCKET_URI}/scaler/scaler.pkl"
s3_resource.Bucket(BUCKET_NAME).upload_file(scaler_filename, "scaler/scaler.pkl")
print(f"Scaler uploaded to S3 at: {scaler_s3_path}")

# Persist the per-column label encoders next to the scaler so inference code can
# apply the same preprocessing as training.
encoders_filename = "label_encoders.pkl"
joblib.dump(label_encoders, encoders_filename)
encoders_s3_path = f"{BUCKET_URI}/scaler/label_encoders.pkl"
s3_resource.Bucket(BUCKET_NAME).upload_file(encoders_filename, "scaler/label_encoders.pkl")
print(f"Label encoders uploaded to S3 at: {encoders_s3_path}")

# Save train and test data locally with headers; the target is the first column.
train_file = "train_scaled.csv"
test_file = "test_scaled.csv"
pd.concat([pd.DataFrame(y_train).reset_index(drop=True), X_train.reset_index(drop=True)], axis=1).to_csv(train_file, index=False, header=True)
pd.concat([pd.DataFrame(y_test).reset_index(drop=True), X_test.reset_index(drop=True)], axis=1).to_csv(test_file, index=False, header=True)

# Upload the training and test CSVs to S3
train_s3_path = f"{BUCKET_URI}/train/train_scaled.csv"
test_s3_path = f"{BUCKET_URI}/test/test_scaled.csv"
s3_resource.Bucket(BUCKET_NAME).upload_file(train_file, "train/train_scaled.csv")
s3_resource.Bucket(BUCKET_NAME).upload_file(test_file, "test/test_scaled.csv")
print(f"Scaled training data uploaded to S3 at: {train_s3_path}")
print(f"Scaled test data uploaded to S3 at: {test_s3_path}")


Scaler uploaded to S3 at: s3://dandadan/scaler/scaler.pkl
Scaled training data uploaded to S3 at: s3://dandadan/train/train_scaled.csv
Scaled test data uploaded to S3 at: s3://dandadan/test/test_scaled.csv


In [74]:
!pip install xgboost==1.5.0




In [13]:
#RandomSearch to find best hyper parameters for model training

import boto3
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Initialize S3 details
BUCKET_NAME = "dandadan"
DATASET_PATH = 'mobile_features_ds.csv'

# Initialize S3 session
s3 = boto3.client("s3")

# Load your dataset directly from S3
obj = s3.get_object(Bucket=BUCKET_NAME, Key=DATASET_PATH)
df = pd.read_csv(obj['Body'])

# Define target and features
TARGET_NAME = "price_range"
X = df.drop(columns=[TARGET_NAME])
y = df[TARGET_NAME]

# Encode target labels as integers starting from 0
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate values sampled by the random search
param_dist = {
    'n_estimators': [50, 100, 200, 300, 500],  # Number of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],  # Step size shrinkage
    'max_depth': [3, 5, 7, 9],  # Maximum depth of a tree
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples used per tree
    'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features used per tree
    'gamma': [0, 0.1, 0.5, 1, 2]  # Minimum loss reduction required for further partitioning
}

# Initialize the XGBoost model. Each individual fit is pinned to one thread
# because the search below already runs candidates in parallel; letting both
# layers use every core would oversubscribe the machine.
xgb_model = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False, n_jobs=1)

# Initialize RandomizedSearchCV. 100 candidates x 3 folds = 300 fits, so the
# candidates are evaluated in parallel across all available cores.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,  # Number of random combinations to try
    scoring='accuracy',  # Use accuracy as the metric for evaluation
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Print progress
    random_state=42,  # For reproducibility
    n_jobs=-1  # Evaluate candidates in parallel
)

# Fit RandomizedSearchCV on your training data
random_search.fit(X_train, y_train)

# Print the best hyperparameters found by Random Search
print("Best Hyperparameters from Random Search:", random_search.best_params_)

# Get the best score achieved with the best hyperparameters
print("Best Cross-Validation Score Achieved:", random_search.best_score_)

# Use the best estimator (refit on the whole training split) to predict on the test set
best_model = random_search.best_estimator_

# Predict on the test data
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {100 * test_accuracy:.2f} %")



Fitting 3 folds for each of 100 candidates, totalling 300 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KeyboardInterrupt: 

In [75]:
%%writefile script.py

from argparse import ArgumentParser
import os
import pandas as pd
import numpy as np
import joblib
import json
import xgboost as xgb
from sklearn.metrics import balanced_accuracy_score

def model_fn(model_dir):
    """Load the trained model for serving.

    Args:
        model_dir: Directory that contains the serialized model file.

    Returns:
        The object deserialized by joblib from ``model.joblib`` inside
        ``model_dir``.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    loaded_model = joblib.load(model_path)
    return loaded_model

def input_fn(request_body, request_content_type):
    """Deserialize a request body into a 2-D NumPy feature array.

    The body is first parsed as JSON (a list of rows, as the notebook clients
    send it, regardless of the declared content type). If that fails and the
    content type is ``text/csv``, the body is parsed as plain CSV instead:
    one row per line, comma-separated numeric values.

    Args:
        request_body: Request payload as ``str`` or ``bytes``.
        request_content_type: Content type declared by the client; only
            consulted to decide whether the CSV fallback is allowed.

    Returns:
        ``numpy.ndarray`` with at least two dimensions; a single flat row such
        as ``[1, 2, 3]`` becomes shape ``(1, 3)``.

    Raises:
        ValueError: If the body is neither valid JSON nor (for ``text/csv``)
            parseable as numeric CSV.
    """
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")

    try:
        rows = json.loads(request_body)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError. Only fall back to CSV
        # when the client actually declared CSV; otherwise surface the error.
        content_type = (request_content_type or "").lower()
        if not content_type.startswith("text/csv"):
            raise
        rows = [
            [float(value) for value in line.split(",")]
            for line in request_body.strip().splitlines()
            if line.strip()
        ]

    # The model predicts on row-major 2-D input; promote scalars / flat rows.
    return np.atleast_2d(np.array(rows))

def predict_fn(input_data, model):
    """Run the model on the deserialized input.

    Args:
        input_data: Features passed straight to ``model.predict``.
        model: Object exposing ``predict``; its result must support
            ``.tolist()``.

    Returns:
        The predictions converted to a plain Python list.
    """
    return model.predict(input_data).tolist()

def output_fn(prediction, content_type):
    """Serialize the prediction as a JSON string.

    Args:
        prediction: JSON-serializable prediction (a list in this script).
        content_type: Requested response content type; not consulted, the
            output is always JSON text.

    Returns:
        ``prediction`` encoded with ``json.dumps``.
    """
    serialized = json.dumps(prediction)
    return serialized

if __name__ == "__main__":
    # Training entry point: parse hyperparameters and paths, train an XGBoost
    # classifier on the training CSV, report balanced accuracy on both splits,
    # and persist the fitted model with joblib.
    print("Extracting arguments...")
    parser = ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n-estimators", type=int, default=200)  # Best value from Random Search
    parser.add_argument("--learning-rate", type=float, default=0.2)  # Best value from Random Search
    parser.add_argument("--max-depth", type=int, default=3)  # Best value from Random Search
    parser.add_argument("--subsample", type=float, default=0.8)  # Best value from Random Search
    parser.add_argument("--gamma", type=float, default=0.1)  # Best value from Random Search
    parser.add_argument("--colsample-bytree", type=float, default=1.0)  # Best value from Random Search

    # Data, model, and output directories (defaults come from the SM_* environment variables)
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train_scaled.csv")
    parser.add_argument("--test-file", type=str, default="test_scaled.csv")

    # Local path to save the model in the current directory
    parser.add_argument("--local-model-path", type=str, default="model_local.joblib")
    args, _ = parser.parse_known_args()

    # The directory defaults are None when the SM_* variables are not set.
    # Fail here with a clear message instead of a TypeError from os.path.join.
    required_paths = (
        ("--model-dir", "SM_MODEL_DIR", args.model_dir),
        ("--train", "SM_CHANNEL_TRAIN", args.train),
        ("--test", "SM_CHANNEL_TEST", args.test),
    )
    for flag, env_name, value in required_paths:
        if value is None:
            parser.error(f"{flag} must be given when {env_name} is not set")

    print("Train channel:", args.train)
    print("Test channel:", args.test)

    # Load training and testing data
    print("Reading data...")
    df_train = pd.read_csv(os.path.join(args.train, args.train_file))
    df_test = pd.read_csv(os.path.join(args.test, args.test_file))

    # Reject NaN values in either split so training and evaluation see clean data.
    if df_train.isnull().any().any():
        raise ValueError("Training data contains NaN values.")
    if df_test.isnull().any().any():
        raise ValueError("Test data contains NaN values.")

    print("Building training and testing datasets...")
    TARGET_NAME = "price_range"
    # Every column except the target and index-like columns is a feature.
    all_columns_name = [col for col in df_train.columns if col not in [TARGET_NAME, 'id', 'Unnamed: 0']]

    X_train = df_train[all_columns_name]
    y_train = df_train[TARGET_NAME].values

    # Train model using XGBoost
    print("Training model...")
    model = xgb.XGBClassifier(
        n_estimators=args.n_estimators,
        learning_rate=args.learning_rate,
        max_depth=args.max_depth,
        subsample=args.subsample,
        gamma=args.gamma,
        colsample_bytree=args.colsample_bytree,
        eval_metric="mlogloss",
        use_label_encoder=False,  # Important for newer versions of XGBoost
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # Validate model on both splits using the same feature columns.
    print("Validating model...")
    bal_acc_train = balanced_accuracy_score(y_train, model.predict(X_train))
    y_test = df_test[TARGET_NAME].values
    bal_acc_test = balanced_accuracy_score(y_test, model.predict(df_test[all_columns_name]))

    print(f"Train balanced accuracy: {100 * bal_acc_train:.3f} %")
    print(f"Test balanced accuracy: {100 * bal_acc_test:.3f} %")

    # Persist model to the model directory under the file name that model_fn loads.
    os.makedirs(args.model_dir, exist_ok=True)
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("Model persisted at " + path)

    # Save a second copy of the model in the current working directory
    local_model_path = os.path.join(os.getcwd(), args.local_model_path)
    joblib.dump(model, local_model_path)
    print(f"Model saved locally at {local_model_path}")


Overwriting script.py


In [76]:
#Loading script file to S3
import boto3

# Bucket that receives the script, and the local file to publish
BUCKET_NAME = 'dandadan'
script_file = 'script.py'

# S3 resource handle and the bucket object used for the upload
s3 = boto3.resource('s3')
script_bucket = s3.Bucket(BUCKET_NAME)

# Copy the local script to the fixed key scripts/script.py, then report where it went
script_bucket.upload_file(script_file, 'scripts/script.py')
print(f'Script uploaded to s3://{BUCKET_NAME}/scripts/{script_file}')


Script uploaded to s3://dandadan/scripts/script.py


In [77]:
#XG Boost estimator

import sagemaker
from sagemaker.xgboost import XGBoost

# Define the SageMaker session; it is handed to the estimator below so the job
# is launched through this explicitly created session.
sagemaker_session = sagemaker.Session()

# NOTE(review): the execution role ARN is hard-coded here while the deployment
# cell resolves it with get_execution_role() - consider unifying the two.
exe_role = 'arn:aws:iam::872515256694:role/service-role/AmazonSageMaker-ExecutionRole-20241012T210214'

# Hyperparameters forwarded to script.py as command-line arguments
# (same values as the argparse defaults in the script).
training_hyperparameters = {
    'n-estimators': 200,  # Best hyperparameter value
    'learning-rate': 0.2,
    'max-depth': 3,
    'subsample': 0.8,
    'gamma': 0.1,
    'colsample-bytree': 1.0
}

# Define the XGBoost estimator
xgb_estimator = XGBoost(
    entry_point='script.py',  # Path to your training script
    role=exe_role,
    instance_count=1,
    instance_type='ml.c5.xlarge',
    framework_version='1.5-1',  # Use the version compatible with your code
    base_job_name='xgboost-sagemaker',  # Base name for the training job
    hyperparameters=training_hyperparameters,
    sagemaker_session=sagemaker_session,
    use_spot_instances=True,  # Use spot instances to save costs
    max_wait=7200,  # Maximum time to wait for spot instances
    max_run=3600,   # Maximum time for the training job
)

# Launch the training job. `train_s3_path` and `test_s3_path` are defined by the
# data-preparation cell, which must have been run in this kernel first.
xgb_estimator.fit({"train": train_s3_path, "test": test_s3_path}, wait=True)


INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.c5.xlarge.
INFO:sagemaker:Creating training-job with name: xgboost-sagemaker-2024-11-20-03-06-55-133


2024-11-20 03:06:56 Starting - Starting the training job...
2024-11-20 03:07:09 Starting - Preparing the instances for training...
2024-11-20 03:07:37 Downloading - Downloading input data...
2024-11-20 03:08:07 Downloading - Downloading the training image...
2024-11-20 03:08:53 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2024-11-20 03:08:43.045 ip-10-0-249-87.us-east-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-11-20 03:08:43.064 ip-10-0-249-87.us-east-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-11-20:03:08:43:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-11-20:03:08:43:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-11-20:03:08:43:INFO] Invoking user training script.[0m
[34m[2024-11-20:03:08:43:INFO] Module script does not provide a setup.py. [0m
[34mGene

In [82]:
#Endpoint Creation

from sagemaker.xgboost import XGBoostModel
from sagemaker import get_execution_role

# Model artifact location, given as an S3 URI (bucket + key) rather than the
# virtual-hosted HTTPS URL of the same object, since `model_data` is documented
# as an S3 location.
# NOTE(review): the training job name is hard-coded; update it after retraining.
artifact_path = "s3://sagemaker-us-east-2-872515256694/xgboost-sagemaker-2024-11-20-03-06-55-133/output/model.tar.gz"


# Create an XGBoost model that serves with the handlers defined in script.py
model = XGBoostModel(
    model_data=artifact_path,
    role=get_execution_role(),  # Ensure you have the correct execution role
    entry_point="script.py",  # Path to your inference script
    framework_version="1.5-1",  # Match the version you used for training
)

# Deploy the model to an endpoint
predictor = model.deploy(
    instance_type="ml.c5.large",  # Choose the instance type
    initial_instance_count=1
)

print("Endpoint created:", predictor.endpoint_name)


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.c5.large.
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-11-20-03-18-36-310
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-11-20-03-18-36-818
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-11-20-03-18-36-818


------!Endpoint created: sagemaker-xgboost-2024-11-20-03-18-36-818


In [81]:
#Delete Endpoint
import boto3

# Create a SageMaker client
sagemaker_client = boto3.client('sagemaker')

# Target the endpoint held by `predictor` (created by the deployment cell, which
# must have been run in this kernel). The previous hard-coded name was dead code:
# it was overwritten by this assignment before ever being used.
endpoint_name = predictor.endpoint_name

# Delete the endpoint.
# NOTE(review): only delete_endpoint is called here; the endpoint configuration
# and model created at deploy time are not removed by this cell.
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
print(f"Deleted endpoint: {endpoint_name}")


Deleted endpoint: sagemaker-xgboost-2024-11-20-03-10-24-501


In [83]:
# Test in "local" if the endpoint works
import json

import boto3
import pandas as pd

runtime = boto3.client("sagemaker-runtime")
test_df = pd.read_csv("test_scaled.csv")
TARGET_NAME = "price_range"
# Feature columns: everything except the target and index-like columns.
all_columns_name = [col for col in test_df.columns if col not in [TARGET_NAME, 'id', 'Unnamed: 0']]
X_test = test_df[all_columns_name]

# Pick one random row and serialize it as a JSON list of rows.
aa = X_test.sample(1)
data_send = json.dumps(aa.values.tolist())


print("data send", data_send)
response = runtime.invoke_endpoint(
    # `predictor.endpoint` is deprecated in SageMaker SDK v2; use `endpoint_name`.
    EndpointName=predictor.endpoint_name,
    Body=data_send,
    # The body is JSON, so declare it as such.
    ContentType="application/json",
)


result = response["Body"].read().decode()

equivalent = {0:"Low Cost",
                  1:"Medium Cost",
                  2:"High Cost",
                  3:"Very High Cost"}

# The endpoint answers with a JSON list holding one class per input row. Parse it
# instead of indexing the second character of the response text, which breaks
# as soon as the formatting changes.
predicted_class = int(json.loads(result)[0])
predicted_label = equivalent[predicted_class]


print(predicted_label)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


data send [[-1.651897076366327, 0.0, 0.4712277952726739, 0.0, 0.8506358319530646, 0.0, -0.289727944856067, -0.3576197377993809, 0.9444937469697804, 1.510315912095323, 0.5191904476506147, -0.813447391645092, -1.2496470844232883, 1.1883563314711472, 0.8991313612166442, 0.067867323135153, 0.3710710634780464, 0.0, 1.0, 1.0]]
High Cost


In [84]:
#Newtest

import json

import boto3
import pandas as pd

# Initialize SageMaker runtime client
runtime = boto3.client("sagemaker-runtime")

# Load test data
test_df = pd.read_csv("test_scaled.csv")
TARGET_NAME = "price_range"

# Extract feature columns
all_columns_name = [col for col in test_df.columns if col not in [TARGET_NAME, 'id', 'Unnamed: 0']]
X_test = test_df[all_columns_name]

# Define the mapping for predictions
equivalent = {0: "Low Cost", 1: "Medium Cost", 2: "High Cost", 3: "Very High Cost"}

# Iterate over each row and print predictions (one endpoint call per row)
for index, row in X_test.iterrows():
    # Serialize the row as a JSON list containing a single row
    data_send = json.dumps([row.tolist()])

    # Send data to the endpoint; the body is JSON, so declare it as such
    response = runtime.invoke_endpoint(
        EndpointName=predictor.endpoint_name,
        Body=data_send,
        ContentType="application/json",
    )

    # Parse the JSON list returned by the endpoint and map the class to its label.
    # Parsing replaces indexing the second character of the response text, which
    # only works for one exact formatting.
    result = response["Body"].read().decode()
    predicted_class = int(json.loads(result)[0])
    predicted_label = equivalent[predicted_class]

    # Print the prediction
    print(f"Row {index + 1} Prediction: {predicted_label}")


Row 1 Prediction: Low Cost
Row 2 Prediction: High Cost
Row 3 Prediction: Medium Cost
Row 4 Prediction: Very High Cost
Row 5 Prediction: Medium Cost
Row 6 Prediction: Medium Cost
Row 7 Prediction: High Cost
Row 8 Prediction: Low Cost
Row 9 Prediction: Very High Cost
Row 10 Prediction: Medium Cost
Row 11 Prediction: Low Cost
Row 12 Prediction: Medium Cost
Row 13 Prediction: High Cost
Row 14 Prediction: Very High Cost
Row 15 Prediction: High Cost
Row 16 Prediction: High Cost
Row 17 Prediction: Very High Cost
Row 18 Prediction: Very High Cost
Row 19 Prediction: Medium Cost
Row 20 Prediction: Low Cost
Row 21 Prediction: Low Cost
Row 22 Prediction: Medium Cost
Row 23 Prediction: Medium Cost
Row 24 Prediction: High Cost
Row 25 Prediction: Low Cost
Row 26 Prediction: Medium Cost
Row 27 Prediction: Very High Cost
Row 28 Prediction: High Cost
Row 29 Prediction: High Cost
Row 30 Prediction: Low Cost
Row 31 Prediction: Low Cost
Row 32 Prediction: Low Cost
Row 33 Prediction: Very High Cost
Row 34 P

In [67]:
import boto3
import json

# Runtime client used to call the hosted model
runtime = boto3.client("sagemaker-runtime")

# Endpoint to query: the one held by the `predictor` object in this kernel
ENDPOINT_NAME = predictor.endpoint_name

# One hand-written sample row, already serialized as a JSON list of rows.
# NOTE(review): these look like raw, unscaled feature values, while the training
# data written by the data-preparation cell is standardized - confirm whether the
# saved scaler should be applied before sending.
raw_input_data = "[[100, 0, 1, 0, 2, 0, 3, 50, 100, 200, 500, 4, 2, 6, 0, 0, 0, 0, 0, 0]]"

# Call the endpoint; any failure is reported as a printed message
try:
    invoke_args = {
        "EndpointName": ENDPOINT_NAME,
        "ContentType": "application/json",
        "Body": raw_input_data,
    }
    response = runtime.invoke_endpoint(**invoke_args)

    # Read the streamed body, decode it and parse the JSON answer
    response_text = response["Body"].read().decode("utf-8")
    result = json.loads(response_text)
    print("Prediction response:", result)

except Exception as e:
    print("Error invoking endpoint:", str(e))


Prediction response: {'prediction': [3]}


In [69]:
import boto3
import json

# Runtime client used to call the hosted model
runtime = boto3.client("sagemaker-runtime")

# Endpoint to query: the one held by the `predictor` object in this kernel
ENDPOINT_NAME = predictor.endpoint_name

# Three hand-written sample rows, serialized as one JSON list of rows.
# NOTE(review): these look like raw, unscaled feature values, while the training
# data written by the data-preparation cell is standardized - confirm whether the
# saved scaler should be applied before sending.
test_inputs = (
    "[[100, 0, 1, 0, 2, 0, 3, 50, 100, 200, 500, 4, 2, 6, 0, 0, 0, 0, 0, 0], "
    "[300, 0, 2, 0, 3, 0, 6, 200, 400, 600, 800, 6, 3, 9, 0, 1, 0, 1, 0, 0], "
    "[700, 1, 4, 1, 5, 1, 8, 500, 800, 1000, 1200, 8, 4, 12, 1, 1, 1, 1, 0, 1]]"
)

print("Sending payload:", test_inputs)

# Call the endpoint; any failure is reported as a printed message
try:
    invoke_args = {
        "EndpointName": ENDPOINT_NAME,
        "ContentType": "application/json",
        "Body": test_inputs,
    }
    response = runtime.invoke_endpoint(**invoke_args)

    # Read the streamed body, decode it and parse the JSON answer
    response_text = response["Body"].read().decode("utf-8")
    result = json.loads(response_text)
    print("Prediction response:", result)

except Exception as e:
    print("Error invoking endpoint:", str(e))


Sending payload: [[100, 0, 1, 0, 2, 0, 3, 50, 100, 200, 500, 4, 2, 6, 0, 0, 0, 0, 0, 0], [300, 0, 2, 0, 3, 0, 6, 200, 400, 600, 800, 6, 3, 9, 0, 1, 0, 1, 0, 0], [700, 1, 4, 1, 5, 1, 8, 500, 800, 1000, 1200, 8, 4, 12, 1, 1, 1, 1, 0, 1]]
Prediction response: {'prediction': [3, 3, 3]}
