In [1]:
import os
import boto3
import re
import json
import pandas as pd
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.model import SKLearnModel
from sklearn.model_selection import train_test_split

# Resolve the AWS context once so later cells can reuse it.
region = boto3.Session().region_name
sm_boto3 = boto3.client("sagemaker")
role = get_execution_role()

# Reuse a single SageMaker session for both uploads and the default-bucket
# lookup instead of constructing a second, throwaway Session.
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = "sagemaker/modelRR"
print(f"bucket: {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
bucket: sagemaker-eu-west-3-024848443248


## Import Libraries and Dataset

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score




In [3]:
# Load the raw dataset from a CSV file in the working directory.
# NOTE(review): the file name is spelled "respiartory"; presumably this matches
# the file on disk — confirm before renaming either side.
df1= pd.read_csv('respiartory_rate_prediction_dataset.csv')


In [4]:
# Preview the first rows of the raw frame.
df1.head()

Unnamed: 0,heart_rate,body_temperature,oxygen_saturation,respiratory_rate
0,69.636603,36.946136,92.055299,13.087644
1,118.919597,36.662769,100.0,18.0
2,90.183049,37.5,100.0,16.229694
3,93.26487,37.5,100.0,14.828433
4,42.267821,36.660542,87.598401,11.059947


## Exploratory data analysis

In [5]:
# Count rows that are exact duplicates of an earlier row.
df1.duplicated().sum()


10

In [6]:
# Work on a de-duplicated frame; df1 keeps the raw data untouched.
df= df1.drop_duplicates()


In [7]:
# Sanity check: no duplicate rows should remain after drop_duplicates().
df.duplicated().sum()

0

In [8]:
# Column dtypes and non-null counts for the de-duplicated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 990 entries, 0 to 999
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   heart_rate         990 non-null    float64
 1   body_temperature   990 non-null    float64
 2   oxygen_saturation  990 non-null    float64
 3   respiratory_rate   990 non-null    float64
dtypes: float64(4)
memory usage: 38.7 KB


In [9]:
# Missing values per column.
df.isnull().sum()

heart_rate           0
body_temperature     0
oxygen_saturation    0
respiratory_rate     0
dtype: int64

In [10]:
# Summary statistics for every numeric column.
df.describe()

Unnamed: 0,heart_rate,body_temperature,oxygen_saturation,respiratory_rate
count,990.0,990.0,990.0,990.0
mean,79.191258,36.937696,94.775231,14.159599
std,18.682453,0.449791,6.467968,1.872105
min,40.0,36.0,69.478515,10.5
25%,66.486974,36.617303,90.708322,12.847047
50%,79.044734,36.983784,97.655579,14.107378
75%,92.488,37.338772,100.0,15.455415
max,120.0,37.5,100.0,18.0


In [11]:
# Missing values per column (same check as the earlier isnull cell; df has not
# been modified in between).
df.isnull().sum()

heart_rate           0
body_temperature     0
oxygen_saturation    0
respiratory_rate     0
dtype: int64

## Data Preprocessing

In [12]:
def remove_outliers(df2, column, factor=1.5):
    """Return the rows of ``df2`` whose ``column`` value lies inside the IQR fence.

    The fence is ``[Q1 - factor * IQR, Q3 + factor * IQR]`` with both bounds
    inclusive, where Q1/Q3 are the 25th/75th percentiles of ``df2[column]``.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fence is computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the previously
        hard-coded 1.5 rule, so existing two-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The subset of rows inside the fence, with the original index labels.
        Rows whose value is NaN fail both comparisons and are dropped.
    """
    Q1 = df2[column].quantile(0.25)
    Q3 = df2[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    inside_fence = (df2[column] >= lower_bound) & (df2[column] <= upper_bound)
    return df2[inside_fence]

# Apply the IQR filter one column at a time. Each pass reassigns df, so later
# columns are filtered on the rows that survived the earlier passes. The loop
# covers every column of df, including the target.
for column_name in df.columns:
    df = remove_outliers(df, column=column_name)

# No figure is created in this cell; the call is kept from the original.
plt.show()

## Train/Test Split & Model Building


In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Target is respiratory_rate; every other column is a feature.
y = df['respiratory_rate']
X = df.drop(columns=['respiratory_rate'])

# Hold out 23% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.23, random_state=23
)

# Baseline regressor with library-default hyperparameters.
modelRR = XGBRegressor()
modelRR.fit(X_train, y_train)

y_pred = modelRR.predict(X_test)

held_out_mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", held_out_mse)

Mean Squared Error: 0.3636736542947787


In [14]:
# Single hand-written observation, in the same column order as X.
sample_features = np.array([[85, 37.0, 88]])
predictions = modelRR.predict(sample_features)

print(predictions)

[13.566343]


## XGBoost Model Evaluation


In [15]:
# Coefficient of determination on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: 0.8843370188081348


## Saving the XGBoost Model


In [16]:
import joblib

# Persist the locally trained regressor next to the notebook.
joblib.dump(value=modelRR, filename='XGBoostRR.pkl')

['XGBoostRR.pkl']

In [17]:
# Build the CSV frames as independent copies with the target appended as the
# last column (script.py reads the last column as the label).
# .assign returns a new frame, so X_train / X_test are never given a target
# column. The previous pd.DataFrame(X_train) wrapper is not a guaranteed copy,
# and adding a column to it can write through to the source frame or emit
# SettingWithCopyWarning, depending on the pandas version.
trainX = X_train.assign(respiratory_rate=y_train)
testX = X_test.assign(respiratory_rate=y_test)

In [18]:
# Write both splits without the index so the target stays the last column.
for split_frame, csv_name in ((trainX, "train-V2.csv"), (testX, "test-V2.csv")):
    split_frame.to_csv(csv_name, index=False)

In [19]:
# Upload both CSVs under a common key prefix; upload_data returns the S3 URI.
sk_prefix = "sagemaker/RR_estimation/XGBcontainer"
trainpath, testpath = (
    sess.upload_data(path=local_csv, bucket=bucket, key_prefix=sk_prefix)
    for local_csv in ("train-V2.csv", "test-V2.csv")
)

In [20]:
# Show where each split landed in S3, one URI per line.
print(trainpath, testpath, sep="\n")

s3://sagemaker-eu-west-3-024848443248/sagemaker/RR_estimation/XGBcontainer/train-V2.csv
s3://sagemaker-eu-west-3-024848443248/sagemaker/RR_estimation/XGBcontainer/test-V2.csv


In [21]:
import tarfile

# Package the pickled local model as a gzip-compressed tarball.
# NOTE(review): the archive member is 'XGBoostRR.pkl', while model_fn in
# script.py loads 'model.joblib' — confirm this tarball is not meant to be
# served with that script.
with tarfile.open(name='XGBoostRR.tar.gz', mode='w:gz') as model_archive:
    model_archive.add(name='XGBoostRR.pkl')


In [22]:
# S3 object keys use "/" as the separator on every platform, so build the key
# explicitly instead of with os.path.join (which follows the local OS rules).
key = f"{prefix}/XGBoostRR.tar.gz"

# The context manager closes the file handle once the upload finishes (or fails);
# previously the handle was left open.
with open("XGBoostRR.tar.gz", "rb") as fObj:
    boto3.Session().resource("s3").Bucket(bucket).Object(key).upload_fileobj(fObj)

In [23]:
# Full S3 URI of the tarball uploaded in the previous cell.
model_data = f"s3://{bucket}/{key}"
print(f"model data: {model_data}")

model data: s3://sagemaker-eu-west-3-024848443248/sagemaker/modelRR/XGBoostRR.tar.gz


In [24]:
import numpy as np
import json
import pandas as pd
import io
from io import BytesIO

In [25]:
%%writefile script.py
import json
import numpy as np
import io
from io import BytesIO
import argparse
import os
import joblib
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score
def model_fn(model_dir):
    """Load the persisted model used for inference.

    Reads ``model.joblib`` from ``model_dir`` (the file name written by this
    script's training entry point) and returns the deserialized object.
    """
    artifact_file = os.path.join(model_dir, 'model.joblib')
    return joblib.load(artifact_file)

def input_fn(request_body, content_type):
    """Deserialize an inference request body into a pandas DataFrame.

    Parameters
    ----------
    request_body : str or bytes
        Raw payload. Text formats may arrive either decoded or as bytes; both
        are accepted.
    content_type : str
        MIME type of the payload. Only the media type is compared, so
        parameters such as ``; charset=utf-8`` are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per observation, with positional (integer) column labels for
        the CSV, NPY and libsvm formats.

    Raises
    ------
    ValueError
        If the media type is not one of application/json, text/csv,
        application/x-npy or text/libsvm.
    """
    # Strip parameters and normalise case so "text/csv; charset=utf-8" matches.
    media_type = (content_type or '').split(';', 1)[0].strip().lower()

    if media_type == 'application/json':
        # json.loads accepts str as well as bytes.
        request = json.loads(request_body)
        return pd.DataFrame(request)
    elif media_type == 'text/csv':
        # io.StringIO only accepts text, so decode a bytes payload first.
        if isinstance(request_body, (bytes, bytearray)):
            request_body = request_body.decode('utf-8')
        return pd.read_csv(io.StringIO(request_body), header=None)
    elif media_type == 'application/x-npy':
        stream = BytesIO(request_body)
        data = np.load(stream)
        return pd.DataFrame(data)
    elif media_type == 'text/libsvm':
        from sklearn.datasets import load_svmlight_file
        # load_svmlight_file needs a *binary* file-like object and returns a
        # SciPy sparse matrix, which must be densified before building a frame.
        if isinstance(request_body, str):
            request_body = request_body.encode('utf-8')
        X, y = load_svmlight_file(BytesIO(request_body))
        # NOTE(review): the column count is inferred from the payload, so a
        # request that omits trailing zero features yields fewer columns.
        return pd.DataFrame(X.toarray())
    else:
        raise ValueError(f'Unsupported content type: {content_type}')

def predict_fn(input_data, model):
    """Run inference by delegating to ``model.predict``.

    ``input_data`` is passed through unchanged and the model's return value is
    handed back as-is.
    """
    return model.predict(input_data)

def output_fn(prediction, accept):
    """Serialize predictions into the representation requested by the client.

    Parameters
    ----------
    prediction : numpy.ndarray
        Model output. May be 1-D (one value per observation, as a regressor
        returns) or 2-D.
    accept : str
        Requested MIME type. Only the media type is compared, so parameters
        such as ``; charset=utf-8`` are ignored when matching.

    Returns
    -------
    tuple of (str, str)
        The serialized body and the content type it was serialized as.

    Raises
    ------
    ValueError
        If the media type is not application/json, text/csv or text/html.
    """
    # Strip parameters and normalise case so "text/html" and
    # "text/html; charset=utf-8" select the same branch.
    media_type = (accept or '').split(';', 1)[0].strip().lower()

    # Handle JSON output
    if media_type == 'application/json':
        response = prediction.tolist()
        return json.dumps(response), 'application/json'

    # Handle CSV output
    elif media_type == 'text/csv':
        output = io.StringIO()
        pd.DataFrame(prediction).to_csv(output, header=False, index=False)
        return output.getvalue(), 'text/csv'

    # Handle HTML output
    elif media_type == 'text/html':
        # Render the predictions as a simple HTML table, one <tr> per row.
        parts = ["<html><body><h2>Prediction Results</h2><table border='1'>"]
        for row in prediction:
            # With a 1-D prediction each "row" is a scalar, which is not
            # iterable; treat it as a single-cell row instead of crashing.
            try:
                values = list(row)
            except TypeError:
                values = [row]
            cells = "".join(f"<td>{val}</td>" for val in values)
            parts.append(f"<tr>{cells}</tr>")
        parts.append("</table></body></html>")
        return "".join(parts), 'text/html; charset=utf-8'

    # Fallback: Raise an error if content type is unsupported
    else:
        raise ValueError(f'Unsupported content type: {accept}')


if __name__ == "__main__":
    # Training entry point: parse flags, fit an XGBoost regressor on the
    # train CSV, report R2 on both splits, and persist the model with joblib.
    print("Extracting arguments")
    cli = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    cli.add_argument("--n-estimators", default=100, type=int)
    cli.add_argument("--learning-rate", default=0.1, type=float)
    cli.add_argument("--max-depth", default=3, type=int)

    # Data, model, and output directories; defaults are read from SM_* environment variables.
    env = os.environ
    cli.add_argument("--model-dir", default=env.get("SM_MODEL_DIR"), type=str)
    cli.add_argument("--train", default=env.get("SM_CHANNEL_TRAIN"), type=str)
    cli.add_argument("--test", default=env.get("SM_CHANNEL_TEST"), type=str)
    cli.add_argument("--train-file", default="train-V2.csv", type=str)
    cli.add_argument("--test-file", default="test-V2.csv", type=str)

    # parse_known_args: flags this script does not declare are ignored.
    args, _unknown = cli.parse_known_args()

    print("Reading data")

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("Building training and testing datasets")

    # The last column is the target; every preceding column is a feature.
    X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
    X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

    print("Training model")

    regressor_params = dict(
        n_estimators=args.n_estimators,
        learning_rate=args.learning_rate,
        max_depth=args.max_depth,
        objective='reg:squarederror',
    )
    model = xgb.XGBRegressor(**regressor_params)
    model.fit(X_train, y_train)

    # Validate model
    print("Validating model")
    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))

    print(f"Train R2: {train_r2:.3f}")
    print(f"Test R2: {test_r2:.3f}")

    # Persist model under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)

Overwriting script.py


In [26]:
# Run the training script locally against the CSVs in the working directory
# before submitting it as a SageMaker job; the model is written to ./model.joblib.
! python script.py --n-estimators 100 \
                 --learning-rate 0.1 \
                 --max-depth 3 \
                 --model-dir ./ \
                 --train ./ \
                 --test ./ \
                 --train-file train-V2.csv \
                 --test-file test-V2.csv

Extracting arguments
Reading data
Building training and testing datasets
Training model
Validating model
Train R2: 0.943
Test R2: 0.910
Model persisted at ./model.joblib


In [27]:
from sagemaker.xgboost.estimator import XGBoost
import sagemaker
from sagemaker import get_execution_role

# Version of the SageMaker XGBoost framework container to train with.
FRAMEWORK_VERSION = "1.5-1"

# Script-mode estimator: script.py is the entry point and receives the
# hyperparameters below as command-line flags.
xgboost_estimator = XGBoost(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    hyperparameters={"n-estimators": 100, "learning-rate": 0.1, "max-depth": 3},
    role=get_execution_role(),
    instance_type="ml.c5.xlarge",
    instance_count=1,
    base_job_name="xgb-training-job",
)


In [28]:
# Launch the training job and block until it finishes; the channel names map to
# the SM_CHANNEL_TRAIN / SM_CHANNEL_TEST defaults read by script.py.
xgboost_estimator.fit(inputs={"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: xgb-training-job-2024-09-23-20-05-13-292


2024-09-23 20:05:13 Starting - Starting the training job...
2024-09-23 20:05:37 Starting - Preparing the instances for training...
2024-09-23 20:06:02 Downloading - Downloading input data...
2024-09-23 20:06:37 Downloading - Downloading the training image...
  from pandas import MultiIndex, Int64Index[0m
[34m[2024-09-23 20:07:10.450 ip-10-0-133-168.eu-west-3.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-09-23 20:07:10.469 ip-10-0-133-168.eu-west-3.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-09-23:20:07:10:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-09-23:20:07:10:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-09-23:20:07:10:INFO] Invoking user training script.[0m
[34m[2024-09-23:20:07:10:INFO] Module script does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2024-09-23:20:07:10:INFO] Generating setup.cfg[0m
[34m[202

In [29]:
# Ask SageMaker where the latest training job stored its model.tar.gz.
training_job_name = xgboost_estimator.latest_training_job.name
job_description = sm_boto3.describe_training_job(TrainingJobName=training_job_name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]
print("Model artifact persisted at " + artifact)

Model artifact persisted at s3://sagemaker-eu-west-3-024848443248/xgb-training-job-2024-09-23-20-05-13-292/output/model.tar.gz


In [30]:
from sagemaker.xgboost.model import XGBoostModel
from sagemaker import get_execution_role

# Deploy the artifact produced by the training job run in this notebook.
# The previous hard-coded S3 URI pointed at an older job (2024-09-08), so the
# endpoint served a different model than the one just trained and the cell
# failed for anyone without access to that exact object.
artifact_path = artifact

model = XGBoostModel(
    model_data=artifact_path,
    role=get_execution_role(),
    entry_point="script.py",  # supplies model_fn / input_fn / predict_fn / output_fn
    framework_version=FRAMEWORK_VERSION,  # same container version used for training
)

In [31]:
# Create a real-time endpoint backed by a single ml.c5.xlarge instance.
# NOTE(review): nothing in this notebook deletes the endpoint afterwards;
# confirm it is cleaned up elsewhere.
predictor = model.deploy(instance_type='ml.c5.xlarge', initial_instance_count=1)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.c5.xlarge.
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-09-23-20-09-50-199
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-09-23-20-09-50-911
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-09-23-20-09-50-911


-----!

In [32]:
# Display the predictor handle returned by deploy().
predictor

<sagemaker.xgboost.model.XGBoostPredictor at 0x7f7508f96770>

In [33]:
import pandas as pd
import numpy as np

# One observation, with columns in the same order as the training features.
feature_values = {
    'heart_rate': [88],
    'body_temperature': [37.3],
    'oxygen_saturation': [77],
}
new_data = pd.DataFrame(feature_values)

# Header-less, index-less CSV with the trailing newline removed.
csv_text = new_data.to_csv(header=False, index=False)
payload = csv_text.strip()

In [34]:
# Nested list (one inner list per row) ready for JSON serialization.
json_data = new_data.to_numpy().tolist()


In [35]:
import json

# Serialize the rows to a JSON string and echo it for inspection.
json_payload = json.dumps(obj=json_data)
print("JSON formatted input:\n", json_payload)


JSON formatted input:
 [[88.0, 37.3, 77.0]]


In [36]:
# Confirm the payload is a plain string before sending it to the endpoint.
print(type(json_payload))

<class 'str'>


In [37]:
# Send the JSON string through the SDK predictor, overriding the request
# ContentType so the endpoint treats the body as application/json.
request_args = {'ContentType': 'application/json'}
result = predictor.predict(json_payload, initial_args=request_args)

print("Prediction result:", result)


Prediction result: [['13.275839']]


In [38]:
from pandas import MultiIndex

In [39]:
import json

In [43]:
import boto3
import numpy as np
from io import BytesIO

# Low-level runtime client, independent of the SageMaker SDK predictor.
client = boto3.client('runtime.sagemaker')

test_data = np.array([[80, 36.5, 95]])

# Serialize the array in NumPy's .npy format; getvalue() returns the whole
# buffer regardless of the current stream position.
buffer = BytesIO()
np.save(buffer, test_data)

response = client.invoke_endpoint(
    # Target the endpoint deployed by this notebook. The previous hard-coded,
    # timestamped endpoint name stopped matching as soon as deploy() was re-run.
    EndpointName=predictor.endpoint_name,
    Body=buffer.getvalue(),
    ContentType='application/x-npy'
)

response_body = response['Body'].read().decode("utf-8")
print(response_body)


[13.719712257385254]
