In [1]:
import os
import boto3
import re
import json
import pandas as pd
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.model import SKLearnModel
from sklearn.model_selection import train_test_split

# AWS context shared by the rest of the notebook.
region = boto3.Session().region_name
sm_boto3 = boto3.client("sagemaker")

role = get_execution_role()
sess = sagemaker.Session()
# Reuse the session created above rather than constructing a second one
# just to look up the default bucket.
bucket = sess.default_bucket()
prefix = "sagemaker/ModelBloodPressure"

print(f"bucket: {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
bucket: sagemaker-eu-west-3-024848443248


## Import libraries and load the dataset

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score




In [3]:
# Load the dataset from a CSV file addressed relative to the working directory.
df= pd.read_csv('blood_pressure_prediction_dataset.csv')


In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,heart_rate,body_temperature,oxygen_saturation,systolic_bp,diastolic_bp
0,92.634938,36.937121,100.0,132.901502,97.183033
1,48.590293,36.16639,85.80877,92.71569,77.088107
2,115.028821,36.897891,89.449863,129.637104,83.614922
3,79.530937,37.5,90.819953,130.044477,88.213206
4,55.46399,37.22586,81.925893,109.12701,74.569882


## Exploratory data analysis

In [5]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): no cell visible in this notebook drops the duplicates reported here.
df.duplicated().sum()


1

In [6]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   heart_rate         1000 non-null   float64
 1   body_temperature   1000 non-null   float64
 2   oxygen_saturation  1000 non-null   float64
 3   systolic_bp        1000 non-null   float64
 4   diastolic_bp       1000 non-null   float64
dtypes: float64(5)
memory usage: 39.2 KB


In [7]:
# Number of missing values per column.
df.isnull().sum()

heart_rate           0
body_temperature     0
oxygen_saturation    0
systolic_bp          0
diastolic_bp         0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
df.describe()

Unnamed: 0,heart_rate,body_temperature,oxygen_saturation,systolic_bp,diastolic_bp
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,79.842176,36.964337,95.272186,125.259808,85.158519
std,19.180861,0.421716,6.339241,14.534694,9.809933
min,40.0,36.0,59.946308,90.0,60.0
25%,66.542222,36.658576,91.466964,115.432555,78.435703
50%,80.599156,37.003479,98.686481,124.981266,85.243901
75%,93.189843,37.358042,100.0,135.186605,91.613573
max,120.0,37.5,100.0,160.0,110.0


In [9]:
# Number of missing values per column (same check as the earlier df.isnull().sum() cell).
df.isnull().sum()

heart_rate           0
body_temperature     0
oxygen_saturation    0
systolic_bp          0
diastolic_bp         0
dtype: int64

## Data Preprocessing

In [10]:
def remove_outliers(df1, column):
    """Keep only the rows of ``df1`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df1[column]``.

    Args:
        df1: DataFrame to filter; it is not modified.
        column: Label of the column the fences are computed from.

    Returns:
        The filtered rows of ``df1`` with their original index labels.
    """
    values = df1[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1

    not_too_low = values >= q1 - 1.5 * spread
    not_too_high = values <= q3 + 1.5 * spread
    return df1[not_too_low & not_too_high]

# Apply the IQR filter column by column. Every column is filtered, including
# the two blood-pressure targets, and each pass works on the rows that
# survived the previous one. (The stray plt.show() was dropped: no figure is
# created in this cell.)
for col in df.columns:
    df = remove_outliers(df, col)



## Train Test split & Model building


In [11]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Features are every column except the two blood-pressure targets.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
X = df.drop(columns=['systolic_bp', 'diastolic_bp'])
y = df[['systolic_bp', 'diastolic_bp']]
# The train/test split is performed in the evaluation cell right before the
# model is fitted; the split that used to live here was overwritten there
# before anything read it.


## Linear regression model evaluation


In [12]:

# Split used for the reported score: 23% of the rows held out, seed 23.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.23,random_state=23)

# Fit one linear model that predicts both target columns of y at once.
plr = LinearRegression().fit(X_train,y_train)

# Predictions on both partitions (not read again by any cell visible here).
y_train_pred = plr.predict(X_train)
y_test_pred = plr.predict(X_test)

# score() on the held-out partition.
print(plr.score(X_test,y_test))

0.8372988976414613


In [13]:
# The model was fitted on a DataFrame, so predict with the same column labels
# instead of a bare array; the values are given in the column order of X.
sample = pd.DataFrame([[85, 37.0, 88]], columns=X.columns)
predictions = plr.predict(sample)

print(predictions)

[[121.73971767  83.04271456]]




## Saving the linear regression model

In [14]:
import joblib
# Persist the fitted regressor to a local file in the working directory.
joblib.dump(plr, 'linear_regression.pkl')


['linear_regression.pkl']

In [15]:
# Build the frames that get exported: feature columns first, then the two
# target columns. pd.concat returns new frames, so X_train / X_test are left
# untouched (wrapping them in pd.DataFrame(...) and assigning columns risks
# writing the targets into the feature frames as well).
trainX = pd.concat([X_train, y_train], axis=1)
testX = pd.concat([X_test, y_test], axis=1)

In [16]:
# Write both partitions to CSV. index=False keeps the row index out of the
# files, so the last two columns remain the targets, which is the layout
# script.py relies on.
trainX.to_csv("train-V1.csv",index=False)
testX.to_csv("test-V1.csv",index=False)

In [17]:
# Upload both CSVs to the session bucket under a common key prefix; the
# returned values are kept for the training job's input channels.
sk_prefix="sagemaker/blood_pressure_regression/skcontainer"
trainpath=sess.upload_data(path="train-V1.csv", bucket=bucket, key_prefix=sk_prefix)
testpath=sess.upload_data(path="test-V1.csv",bucket=bucket, key_prefix=sk_prefix)

In [18]:
# Show where the two uploads landed.
print(trainpath)
print(testpath)

s3://sagemaker-eu-west-3-024848443248/sagemaker/blood_pressure_regression/skcontainer/train-V1.csv
s3://sagemaker-eu-west-3-024848443248/sagemaker/blood_pressure_regression/skcontainer/test-V1.csv


In [19]:
import tarfile

# Pack the locally saved model into a gzip-compressed tar archive.
# NOTE(review): the archive contains 'linear_regression.pkl', whereas
# script.py's model_fn loads 'model.joblib'; confirm this archive is not meant
# to be served with that script.
with tarfile.open('linear_regression.tar.gz', mode='w:gz') as archive:
    archive.add('linear_regression.pkl')

In [20]:
# S3 keys always use '/' as separator, so build the key explicitly instead of
# relying on the local OS path separator.
key = f"{prefix}/linear_regression.tar.gz"
# The context manager closes the file handle once the upload has finished.
with open("linear_regression.tar.gz", "rb") as fObj:
    boto3.Session().resource("s3").Bucket(bucket).Object(key).upload_fileobj(fObj)

In [21]:
# S3 URI of the archive uploaded in the previous step.
model_data = f"s3://{bucket}/{key}"
print(f"model data: {model_data}")

model data: s3://sagemaker-eu-west-3-024848443248/sagemaker/ModelBloodPressure/linear_regression.tar.gz


In [22]:
%%writefile script.py
import json
import numpy as np
import io
from io import BytesIO
import argparse
import os
import joblib
import json
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def model_fn(model_dir):
    """Load the serialized model for inference.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        The object deserialized by joblib from ``<model_dir>/model.joblib``.
    """
    artifact_file = os.path.join(model_dir, 'model.joblib')
    return joblib.load(artifact_file)

def input_fn(request_body, content_type):
    """Deserialize an inference request body into a pandas DataFrame.

    Supported media types: ``application/json``, ``text/csv`` (no header row),
    ``application/x-npy`` and ``text/libsvm``. Parameters after the media type
    (for example ``; charset=utf-8``) and letter case are ignored when
    matching.

    Args:
        request_body: Raw request payload. Text formats are accepted either as
            ``str`` or as UTF-8 encoded ``bytes``/``bytearray``, because the
            body may arrive undecoded depending on the serving stack.
        content_type: Value of the request's content-type header.

    Returns:
        A DataFrame with one row per sample.

    Raises:
        ValueError: If the media type is not one of the supported ones.
    """
    # Match on the bare media type so "text/csv; charset=utf-8" is still CSV.
    media_type = (content_type or '').split(';', 1)[0].strip().lower()
    body_is_binary = isinstance(request_body, (bytes, bytearray))

    if media_type == 'application/json':
        # json.loads accepts both str and UTF-8 encoded bytes.
        return pd.DataFrame(json.loads(request_body))

    if media_type == 'text/csv':
        # io.StringIO only accepts str, so decode a binary body first.
        text = request_body.decode('utf-8') if body_is_binary else request_body
        return pd.read_csv(io.StringIO(text), header=None)

    if media_type == 'application/x-npy':
        return pd.DataFrame(np.load(BytesIO(request_body)))

    if media_type == 'text/libsvm':
        from sklearn.datasets import load_svmlight_file
        # load_svmlight_file reads from a binary stream and returns a sparse
        # feature matrix; densify it before building the DataFrame.
        raw = bytes(request_body) if body_is_binary else request_body.encode('utf-8')
        X, _ = load_svmlight_file(BytesIO(raw))
        return pd.DataFrame(X.toarray())

    raise ValueError(f'Unsupported content type: {content_type}')

def predict_fn(input_data, model):
    """Run ``model.predict`` on the deserialized input and return its result unchanged."""
    return model.predict(input_data)

def output_fn(prediction, accept):
    """Serialize predictions into the representation named by ``accept``.

    Args:
        prediction: Array-like of predictions. The JSON branch calls
            ``prediction.tolist()``; the HTML branch iterates rows and then
            the values inside each row.
        accept: Requested response type; one of ``application/json``,
            ``text/csv`` or ``text/html; charset=utf-8`` (exact match).

    Returns:
        A ``(body, content_type)`` tuple.

    Raises:
        ValueError: If ``accept`` is none of the supported values.
    """
    if accept == 'application/json':
        return json.dumps(prediction.tolist()), 'application/json'

    if accept == 'text/csv':
        csv_buffer = io.StringIO()
        pd.DataFrame(prediction).to_csv(csv_buffer, header=False, index=False)
        return csv_buffer.getvalue(), 'text/csv'

    if accept == 'text/html; charset=utf-8':
        # One <tr> per prediction row, one <td> per value in that row.
        table_rows = "".join(
            "<tr>" + "".join(f"<td>{val}</td>" for val in row) + "</tr>"
            for row in prediction
        )
        page = (
            "<html><body><h2>Prediction Results</h2><table border='1'>"
            + table_rows
            + "</table></body></html>"
        )
        return page, 'text/html; charset=utf-8'

    raise ValueError(f'Unsupported content type: {accept}')

def str_to_bool(value):
    """Convert a command-line flag value such as ``"True"`` or ``"false"`` to a bool.

    argparse hands the raw string to ``type``. The built-in ``bool`` treats
    every non-empty string, including ``"False"``, as ``True``, so an explicit
    parser is required for boolean hyperparameters.

    Args:
        value: The raw argument; a ``bool`` is returned unchanged.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Training entry point: read the train/test CSVs, fit a LinearRegression,
    # report MSE and R² on both partitions and save the model as model.joblib.
    print("Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameter forwarded to LinearRegression.
    parser.add_argument("--fit-intercept", type=str_to_bool, default=True)

    # Directories default to the SM_* environment variables; file names
    # default to the CSVs produced by the notebook.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V1.csv")
    parser.add_argument("--test-file", type=str, default="test-V1.csv")

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = parser.parse_known_args()

    print("Reading data")

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("Building training and testing datasets")
    # The last two columns of each file are the targets; everything before
    # them is a feature. Each frame is sliced by its own column labels.
    train_targets = list(train_df.columns[-2:])
    test_targets = list(test_df.columns[-2:])
    X_train = train_df.drop(columns=train_targets)
    X_test = test_df.drop(columns=test_targets)
    y_train = train_df[train_targets]
    y_test = test_df[test_targets]
    print("Training model")

    model = LinearRegression(fit_intercept=args.fit_intercept)

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print("Validating model")
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    print(f"Train MSE: {train_mse:.3f}")
    print(f"Test MSE: {test_mse:.3f}")

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(f"Train R² Score: {train_r2:.3f}")
    print(f"Test R² Score: {test_r2:.3f}")

    # Same file name that model_fn loads at inference time.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print(f"Model saved at {model_path}")

Overwriting script.py


In [23]:
# Run script.py locally against the CSVs in the working directory; the model
# is written to ./model.joblib.
! python script.py --fit-intercept True \
                 --model-dir ./ \
                 --train ./ \
                 --test ./ \
                 --train-file train-V1.csv \
                 --test-file test-V1.csv


Extracting arguments
Reading data
Building training and testing datasets
Training model
Validating model
Train MSE: 15.725
Test MSE: 17.542
Train R² Score: 0.865
Test R² Score: 0.837
Model saved at ./model.joblib


In [24]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

# Version string passed to the estimator as framework_version.
FRAMEWORK_VERSION = "1.2-1"  

# Estimator that runs script.py as the training entry point.
# NOTE(review): the job name prefix says "poly-reg" although script.py fits a
# plain LinearRegression; confirm the name is intentional.
sklearn_estimator = SKLearn(
    entry_point="script.py",  
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.c5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="poly-reg-training-job",  
    hyperparameters={
        "fit-intercept": "True"  # matches script.py's --fit-intercept argument
    },
)

In [26]:
# Launch the training job with the two uploaded CSVs as the "train" and "test"
# input channels (presumably surfaced to script.py through SM_CHANNEL_TRAIN /
# SM_CHANNEL_TEST; verify against the container docs).
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: poly-reg-training-job-2024-09-23-20-07-21-551


2024-09-23 20:07:22 Starting - Starting the training job...
2024-09-23 20:07:37 Starting - Preparing the instances for training...
2024-09-23 20:07:57 Downloading - Downloading input data...
2024-09-23 20:08:22 Downloading - Downloading the training image..[34m2024-09-23 20:09:02,855 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-09-23 20:09:02,857 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-09-23 20:09:02,860 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-09-23 20:09:02,877 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-09-23 20:09:03,139 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-09-23 20:09:03,142 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-09-23 20:09:03,163 sagemaker

In [27]:
# Look up the S3 location of the model artifact produced by the estimator's
# latest training job.
artifact = sm_boto3.describe_training_job(
   TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

Model artifact persisted at s3://sagemaker-eu-west-3-024848443248/poly-reg-training-job-2024-09-23-20-07-21-551/output/model.tar.gz


In [28]:
import sagemaker
from sagemaker.sklearn import SKLearnModel

role = sagemaker.get_execution_role()
# Serve the artifact of the training job launched above instead of a
# hard-coded URI from an earlier run, so a fresh Run All deploys the model
# that was just trained.
artifact_path = artifact

# script.py also provides the inference hooks (model_fn, input_fn, predict_fn,
# output_fn) for the serving container.
model = SKLearnModel(
    role=role,
    model_data=artifact_path,
    framework_version=FRAMEWORK_VERSION,  # same version the estimator trained with
    py_version="py3",
    entry_point="script.py",
)


In [33]:
# Create a real-time endpoint backed by one ml.c5.large instance.
# NOTE(review): no cell visible in this notebook deletes the endpoint afterwards.
predictor = model.deploy(instance_type='ml.c5.large', initial_instance_count=1)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-09-23-20-14-06-684
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2024-09-23-20-14-07-414
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2024-09-23-20-14-07-414


-------!

In [34]:
import pandas as pd
import numpy as np

# One sample, with the columns in the same order as the training features.
new_data = pd.DataFrame({
    'heart_rate': [88],
    'body_temperature': [37.3],
    'oxygen_saturation': [77],
})

# Header-less, index-less CSV text for the sample.
# NOTE(review): `payload` is not read by any later cell visible here.
payload = new_data.to_csv(header=False, index=False).strip()

In [35]:
# Same sample as a nested list (one inner list per row).
json_data = new_data.values.tolist()

In [36]:
import json

# JSON text of the sample rows, printed for inspection.
json_payload = json.dumps(json_data)
print("JSON formatted input:\n", json_payload)


JSON formatted input:
 [[88.0, 37.3, 77.0]]


In [38]:
import boto3
import numpy as np
from io import BytesIO

# Low-level runtime client used to call the deployed endpoint directly.
client = boto3.client('runtime.sagemaker')

test_data = np.array([[80, 36.5, 95]])

# Serialize the sample in .npy format; script.py's input_fn handles
# 'application/x-npy'. getvalue() returns the whole buffer regardless of the
# stream position, so no seek is needed.
buffer = BytesIO()
np.save(buffer, test_data)

response = client.invoke_endpoint(
    # Use the endpoint created by model.deploy() rather than a hard-coded
    # name from a previous run.
    EndpointName=predictor.endpoint_name,
    Body=buffer.getvalue(),
    ContentType='application/x-npy'
)

response_body = response['Body'].read().decode("utf-8")
print(response_body)


[[118.90291949174195, 82.12846207245718]]
