# Manu's ECG Ailment Predictor with AWS Sagemaker Deployment

#### Note: to get started with this project, you will first need to set up an AWS virtual environment! Follow these steps: 
1. Install the AWS CLI: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
2. Create an AWS account and go to IAM. Then, create a user with administrator access.
3. Create a CLI access key for this user. Make sure to keep the access and secret access keys somewhere!
4. Then, in your terminal, type: "aws configure" (without the strings around it)
5. Log in with your keys and region (ex: "us-west-1"), don't put anything for output (just hit enter)
6. Then, open a blank jupyter notebook in your IDE
7. To create the virtual env, type the following command in terminal: "conda create -p myenv python=3.8"
8. Then, to activate the virtual env, type the following command in terminal: "conda activate myenv /\"
9. Use pip to install all imports in requirements.txt
10. You're ready to start!

In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'sagemakerproj1'
print("Using bucket " + bucket)


sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/manuthakur/Library/Application Support/sagemaker/config.yaml
Using bucket sagemakerproj1


In [2]:
ecgWithRecord = pd.read_csv("ECGCvdata.csv")

In [3]:
ecg = ecgWithRecord.drop('RECORD', axis=1)

In [4]:
ecg.head()

Unnamed: 0,hbpermin,Pseg,PQseg,QRSseg,QRseg,QTseg,RSseg,STseg,Tseg,PTseg,...,RMSSD,QRSarea,QRSperi,PQslope,QRslope,RSslope,STslope,NN50,pNN50,ECG_signal
0,74.925669,0.076508,0.108889,0.088254,0.043571,0.193016,0.044683,0.104762,0.130476,0.301905,...,292.296636,18.457618,63.615239,-0.014364,0.07527,-0.070846,0.012606,2,5.882353,ARR
1,68.503469,0.072483,0.096181,0.093924,0.046267,0.19349,0.047656,0.099566,0.089149,0.28967,...,318.563915,23.04323,67.787773,-0.021207,0.083773,-0.077458,0.016175,1,3.225806,ARR
2,83.488603,0.071154,0.08661,0.03953,0.01859,0.132479,0.02094,0.092949,0.094444,0.219088,...,273.165412,10.756353,29.253827,-0.042542,,,0.027131,16,42.105263,ARR
3,68.503469,0.082812,0.10816,0.090365,0.045226,0.188802,0.045139,0.098437,0.088281,0.296962,...,313.046941,23.845086,65.228737,-0.017806,0.091591,-0.088559,0.017022,2,6.451613,ARR
4,82.08,0.07076,0.102632,0.101023,0.049415,0.193713,0.051608,0.09269,0.085965,0.296345,...,263.225677,20.942791,72.830353,-0.01665,0.064547,-0.062246,0.016191,2,5.405405,ARR


In [5]:
ecg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 55 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   hbpermin    1200 non-null   float64
 1   Pseg        1200 non-null   float64
 2   PQseg       1200 non-null   float64
 3   QRSseg      1200 non-null   float64
 4   QRseg       1200 non-null   float64
 5   QTseg       1200 non-null   float64
 6   RSseg       1200 non-null   float64
 7   STseg       1200 non-null   float64
 8   Tseg        1200 non-null   float64
 9   PTseg       1200 non-null   float64
 10  ECGseg      1200 non-null   float64
 11  QRtoQSdur   431 non-null    float64
 12  RStoQSdur   431 non-null    float64
 13  RRmean      1200 non-null   float64
 14  PPmean      1200 non-null   float64
 15  PQdis       1200 non-null   float64
 16  PonQdis     1200 non-null   float64
 17  PRdis       1200 non-null   float64
 18  PonRdis     1200 non-null   float64
 19  PSdis       1200 non-null  

In [6]:
ecg.isnull().sum()

hbpermin        0
Pseg            0
PQseg           0
QRSseg          0
QRseg           0
QTseg           0
RSseg           0
STseg           0
Tseg            0
PTseg           0
ECGseg          0
QRtoQSdur     769
RStoQSdur     769
RRmean          0
PPmean          0
PQdis           0
PonQdis         0
PRdis           0
PonRdis         0
PSdis           0
PonSdis         0
PTdis           0
PonTdis         0
PToffdis        0
QRdis           0
QSdis           0
QTdis           0
QToffdis        0
RSdis           0
RTdis           0
RToffdis        0
STdis           0
SToffdis        0
PonToffdis      0
PonPQang      557
PQRang        768
QRSang        769
RSTang        769
STToffang     426
RRTot           0
NNTot           0
SDRR            0
IBIM            0
IBISD           0
SDSD            0
RMSSD           0
QRSarea         0
QRSperi         0
PQslope         0
QRslope       768
RSslope       769
STslope         0
NN50            0
pNN50           0
ECG_signal      0
dtype: int

In [7]:
ecg.describe()

Unnamed: 0,hbpermin,Pseg,PQseg,QRSseg,QRseg,QTseg,RSseg,STseg,Tseg,PTseg,...,SDSD,RMSSD,QRSarea,QRSperi,PQslope,QRslope,RSslope,STslope,NN50,pNN50
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,...,1200.0,1200.0,1200.0,1200.0,1200.0,432.0,431.0,1200.0,1200.0,1200.0
mean,81.89398,0.060902,0.078099,0.048238,0.024437,0.141541,0.0238,0.093303,0.102714,0.21964,...,42.48406,199.417894,7.675047,23.915846,-0.071647,0.141737,-0.150496,0.041659,6.4425,8.112425
std,19.324351,0.009453,0.019897,0.034649,0.017501,0.029225,0.017175,0.013332,0.022594,0.046359,...,105.722123,112.292014,8.452832,25.070324,0.049705,0.073086,0.077532,0.026456,7.748445,10.325033
min,12.857143,0.021556,0.044526,0.0,0.0,0.098761,0.0,0.052338,0.034936,0.15,...,0.800624,61.711281,0.0,0.0,-0.219938,0.02866,-0.315525,0.008183,0.0,0.0
25%,67.563,0.053943,0.060344,0.015101,0.007804,0.116016,0.007143,0.089423,0.090046,0.177343,...,4.060951,123.147741,0.0,0.124186,-0.10666,0.082613,-0.240304,0.023853,1.0,0.613497
50%,79.872,0.060642,0.075184,0.044692,0.022676,0.137019,0.021965,0.097825,0.09921,0.215223,...,11.265829,183.603851,5.228299,19.90406,-0.058518,0.108418,-0.116241,0.03446,3.0,3.680982
75%,96.0,0.066849,0.09519,0.083032,0.041915,0.166314,0.041097,0.1012,0.112408,0.260269,...,44.785825,248.101035,14.355389,46.194325,-0.027685,0.229659,-0.087565,0.049778,10.0,12.883436
max,160.5,0.095317,0.145577,0.12016,0.065278,0.211111,0.058333,0.138444,0.198413,0.347324,...,1725.927603,1478.48846,33.131007,86.577082,-0.005427,0.270786,-0.027983,0.164633,37.0,52.727273


In [8]:
ecg['ECG_signal'].value_counts()
# ECG signal is the target column

ECG_signal
ARR    300
AFF    300
CHF    300
NSR    300
Name: count, dtype: int64

In [9]:
ecgNoSignal = ecg.drop('ECG_signal', axis=1)
ecgNoSignal = ecgNoSignal.fillna(ecgNoSignal.median())

In [10]:
ecgNoSignal['ECG_signal'] = ecg['ECG_signal']
ecgFilled = ecgNoSignal

In [11]:
ecgFilled.columns

Index(['hbpermin', 'Pseg', 'PQseg', 'QRSseg', 'QRseg', 'QTseg', 'RSseg',
       'STseg', 'Tseg', 'PTseg', 'ECGseg', 'QRtoQSdur', 'RStoQSdur', 'RRmean',
       'PPmean', 'PQdis', 'PonQdis', 'PRdis', 'PonRdis', 'PSdis', 'PonSdis',
       'PTdis', 'PonTdis', 'PToffdis', 'QRdis', 'QSdis', 'QTdis', 'QToffdis',
       'RSdis', 'RTdis', 'RToffdis', 'STdis', 'SToffdis', 'PonToffdis',
       'PonPQang', 'PQRang', 'QRSang', 'RSTang', 'STToffang', 'RRTot', 'NNTot',
       'SDRR', 'IBIM', 'IBISD', 'SDSD', 'RMSSD', 'QRSarea', 'QRSperi',
       'PQslope', 'QRslope', 'RSslope', 'STslope', 'NN50', 'pNN50',
       'ECG_signal'],
      dtype='object')

In [12]:
ecgFilled.isnull().sum()

hbpermin      0
Pseg          0
PQseg         0
QRSseg        0
QRseg         0
QTseg         0
RSseg         0
STseg         0
Tseg          0
PTseg         0
ECGseg        0
QRtoQSdur     0
RStoQSdur     0
RRmean        0
PPmean        0
PQdis         0
PonQdis       0
PRdis         0
PonRdis       0
PSdis         0
PonSdis       0
PTdis         0
PonTdis       0
PToffdis      0
QRdis         0
QSdis         0
QTdis         0
QToffdis      0
RSdis         0
RTdis         0
RToffdis      0
STdis         0
SToffdis      0
PonToffdis    0
PonPQang      0
PQRang        0
QRSang        0
RSTang        0
STToffang     0
RRTot         0
NNTot         0
SDRR          0
IBIM          0
IBISD         0
SDSD          0
RMSSD         0
QRSarea       0
QRSperi       0
PQslope       0
QRslope       0
RSslope       0
STslope       0
NN50          0
pNN50         0
ECG_signal    0
dtype: int64

In [13]:
ecg = ecgFilled

In [14]:
features = list(ecg.columns)
features

['hbpermin',
 'Pseg',
 'PQseg',
 'QRSseg',
 'QRseg',
 'QTseg',
 'RSseg',
 'STseg',
 'Tseg',
 'PTseg',
 'ECGseg',
 'QRtoQSdur',
 'RStoQSdur',
 'RRmean',
 'PPmean',
 'PQdis',
 'PonQdis',
 'PRdis',
 'PonRdis',
 'PSdis',
 'PonSdis',
 'PTdis',
 'PonTdis',
 'PToffdis',
 'QRdis',
 'QSdis',
 'QTdis',
 'QToffdis',
 'RSdis',
 'RTdis',
 'RToffdis',
 'STdis',
 'SToffdis',
 'PonToffdis',
 'PonPQang',
 'PQRang',
 'QRSang',
 'RSTang',
 'STToffang',
 'RRTot',
 'NNTot',
 'SDRR',
 'IBIM',
 'IBISD',
 'SDSD',
 'RMSSD',
 'QRSarea',
 'QRSperi',
 'PQslope',
 'QRslope',
 'RSslope',
 'STslope',
 'NN50',
 'pNN50',
 'ECG_signal']

In [15]:
label = features.pop(-1)
label

'ECG_signal'

In [16]:
x = ecg[features]
y = ecg[label]

In [17]:
x.head()

Unnamed: 0,hbpermin,Pseg,PQseg,QRSseg,QRseg,QTseg,RSseg,STseg,Tseg,PTseg,...,SDSD,RMSSD,QRSarea,QRSperi,PQslope,QRslope,RSslope,STslope,NN50,pNN50
0,74.925669,0.076508,0.108889,0.088254,0.043571,0.193016,0.044683,0.104762,0.130476,0.301905,...,27.158481,292.296636,18.457618,63.615239,-0.014364,0.07527,-0.070846,0.012606,2,5.882353
1,68.503469,0.072483,0.096181,0.093924,0.046267,0.19349,0.047656,0.099566,0.089149,0.28967,...,9.665517,318.563915,23.04323,67.787773,-0.021207,0.083773,-0.077458,0.016175,1,3.225806
2,83.488603,0.071154,0.08661,0.03953,0.01859,0.132479,0.02094,0.092949,0.094444,0.219088,...,111.816694,273.165412,10.756353,29.253827,-0.042542,0.108418,-0.116241,0.027131,16,42.105263
3,68.503469,0.082812,0.10816,0.090365,0.045226,0.188802,0.045139,0.098437,0.088281,0.296962,...,11.193252,313.046941,23.845086,65.228737,-0.017806,0.091591,-0.088559,0.017022,2,6.451613
4,82.08,0.07076,0.102632,0.101023,0.049415,0.193713,0.051608,0.09269,0.085965,0.296345,...,60.245998,263.225677,20.942791,72.830353,-0.01665,0.064547,-0.062246,0.016191,2,5.405405


In [18]:
x.shape

(1200, 54)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size=0.20, random_state=0)

In [20]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

In [21]:
trainX.shape

(240, 55)

In [22]:
testX = pd.DataFrame(X_test)
testX[label] = y_test

In [23]:
trainX.to_csv("train-v1.csv", index=False)
testX.to_csv("test-v1.csv", index=False)

In [24]:
sk_prefix = "sagemaker/ecg_heart_classification/sklearncontainer"
trainpath = sess.upload_data(
    path="train-v1.csv", bucket=bucket, key_prefix=sk_prefix
    )
testpath = sess.upload_data(
    path="test-v1.csv", bucket=bucket, key_prefix=sk_prefix
    )
print(trainpath)
print(testpath)

s3://sagemakerproj1/sagemaker/ecg_heart_classification/sklearncontainer/train-v1.csv
s3://sagemakerproj1/sagemaker/ecg_heart_classification/sklearncontainer/test-v1.csv


In [25]:
%%writefile RF_script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib")) ###
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-v1.csv")
    parser.add_argument("--test-file", type=str, default="test-v1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print(features)

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (80%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (20%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    # RF: 
    RFmodel = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)

    # GB: 
    # GBmodel =  GradientBoostingClassifier(n_estimators=args.n_estimators, random_state=args.random_state, learning_rate=0.1, 
    # max_depth = 3, min_samples_split=5, min_samples_leaf=2, subsample=0.8, verbose = 3)

    # LR and SVM: 
    # scaler = StandardScaler()
    # SVM_X_train_scaled = scaler.fit_transform(X_train)
    # SVM_X_test_scaled = scaler.transform(X_test)
    # # model = LogisticRegressionCV(Cs=10, cv=5, penalty='l1', solver='liblinear', random_state=42)
    # SVMmodel = SVC(kernel='rbf', C=1.0, random_state=42)

    # For RF and GB and SVM: 
    RFmodel.fit(X_train, y_train)
    # GBmodel.fit(X_train, y_train)
    # SVMmodel.fit(SVM_X_train_scaled, y_train) #don't fit/transform y!
    print()
    
    # save all to model.joblib (the same library)!
    # for sagemaker, simply change the desired model's to "model.joblib"
    # sagemaker only accepts one model as input
    # IF TREATING ENTIRE THING AS ONE MODEL, ONLY MAKE ONE PATH!
    RFmodel_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(RFmodel,RFmodel_path)
    print("Combined Model persisted at " + RFmodel_path)
    print()
    # GBmodel_path = os.path.join(args.model_dir, "GBmodel.joblib")
    # joblib.dump(GBmodel,GBmodel_path)
    # print("GB Model persisted at " + GBmodel_path)
    # print()
    # SVMmodel_path = os.path.join(args.model_dir, "SVMmodel.joblib")
    # joblib.dump(GBmodel,GBmodel_path)
    # print("SVM Model persisted at " + SVMmodel_path)
    # print()

    
    # y_pred_test = model.predict(X_test_scaled)
    RF_y_pred_test = RFmodel.predict(X_test)
    RF_test_acc = accuracy_score(y_test,RF_y_pred_test)
    RF_test_rep = classification_report(y_test,RF_y_pred_test)

    # GB_y_pred_test = GBmodel.predict(X_test)
    # GB_test_acc = accuracy_score(y_test,GB_y_pred_test)
    # GB_test_rep = classification_report(y_test,GB_y_pred_test)

    # SVM_y_pred_test = SVMmodel.predict(SVM_X_test_scaled)
    # SVM_test_acc = accuracy_score(y_test,SVM_y_pred_test)
    # SVM_test_rep = classification_report(y_test,SVM_y_pred_test)

    #For LR and SVM: 
    # train_accuracy = model.score(X_train_scaled, y_train)
    # test_accuracy = model.score(X_test_scaled, y_test)    

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] RF Model Accuracy is: ', RF_test_acc)
    print('[TESTING] RF Testing Report: ')
    print(RF_test_rep)
    print()
    # print('[TESTING] GB Model Accuracy is: ', GB_test_acc)
    # print('[TESTING] GB Testing Report: ')
    # print(GB_test_rep)
    # print()
    # print('[TESTING] SVM Model Accuracy is: ', SVM_test_acc)
    # print('[TESTING] SVM Testing Report: ')
    # print(SVM_test_rep)
    # print()



Overwriting RF_script.py


In [26]:
%%writefile GB_script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib")) ###BECUASE OF THIS, THE FILEPATH HAS TO END WITH "model.joblib"!!!
    #Or, change the name that this saves to to "{name}_model.joblib", as each model has its own function that can be called
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-v1.csv")
    parser.add_argument("--test-file", type=str, default="test-v1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print(features)

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (80%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (20%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    # GB: 
    print("Training GradientBoosting Model.....")
    GBmodel =  GradientBoostingClassifier(n_estimators=args.n_estimators, random_state=args.random_state, learning_rate=0.1, 
    max_depth = 3, min_samples_split=5, min_samples_leaf=2, subsample=0.8, verbose = 3)
    GBmodel.fit(X_train, y_train)

    print()
    
    GBmodel_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(GBmodel,GBmodel_path)
    print("GB Model persisted at " + GBmodel_path)
    print()


    GB_y_pred_test = GBmodel.predict(X_test)
    GB_test_acc = accuracy_score(y_test,GB_y_pred_test)
    GB_test_rep = classification_report(y_test,GB_y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])

    print('[TESTING] GB Model Accuracy is: ', GB_test_acc)
    print('[TESTING] GB Testing Report: ')
    print(GB_test_rep)
    print()


Overwriting GB_script.py


In [27]:
%%writefile SVM_script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib")) ###
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-v1.csv")
    parser.add_argument("--test-file", type=str, default="test-v1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print(features)

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (80%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (20%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
    # LR and SVM: 
    print("Training SVM Model.....")
    scaler = StandardScaler()
    SVM_X_train_scaled = scaler.fit_transform(X_train)
    SVM_X_test_scaled = scaler.transform(X_test)
    # model = LogisticRegressionCV(Cs=10, cv=5, penalty='l1', solver='liblinear', random_state=42)
    SVMmodel = SVC(kernel='rbf', C=1.0, random_state=42)

    SVMmodel.fit(SVM_X_train_scaled, y_train) #don't fit/transform y!
    print()
    
    SVMmodel_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(SVMmodel,SVMmodel_path)
    print("SVM Model persisted at " + SVMmodel_path)
    print()

    SVM_y_pred_test = SVMmodel.predict(SVM_X_test_scaled)
    SVM_test_acc = accuracy_score(y_test,SVM_y_pred_test)
    SVM_test_rep = classification_report(y_test,SVM_y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])

    print('[TESTING] SVM Model Accuracy is: ', SVM_test_acc)
    print('[TESTING] SVM Testing Report: ')
    print(SVM_test_rep)
    print()



Overwriting SVM_script.py


In [28]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

RF_sklearn_estimator = SKLearn(
    entry_point="RF_script.py",
    role="arn:aws:iam::510855072877:role/service-role/AmazonSageMaker-ExecutionRole-20231230T072497",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    # base_job_name="GB-custom-sklearn",
    # base_job_name="SVM-custom-sklearn",
    hyperparameters={
        "n_estimators": 50,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)
    

In [29]:
FRAMEWORK_VERSION = "0.23-1"

GB_sklearn_estimator = SKLearn(
    entry_point="GB_script.py",
    role="arn:aws:iam::510855072877:role/service-role/AmazonSageMaker-ExecutionRole-20231230T072497",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    # base_job_name="Combined-custom-sklearn",
    base_job_name="GB-custom-sklearn",
    # base_job_name="SVM-custom-sklearn",
    hyperparameters={
        "n_estimators": 50,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [30]:
FRAMEWORK_VERSION = "0.23-1"

SVM_sklearn_estimator = SKLearn(
    entry_point="SVM_script.py",
    role="arn:aws:iam::510855072877:role/service-role/AmazonSageMaker-ExecutionRole-20231230T072497",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    # base_job_name="Combined-custom-sklearn",
    # base_job_name="GB-custom-sklearn",
    base_job_name="SVM-custom-sklearn",
    # hyperparameters={
    #     "n_estimators": 50,
    #     "random_state": 0,
    # },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [31]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import GradientBoostingClassifier

# # Define the parameter grid to search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.1, 0.05, 0.01],
#     'max_depth': [3, 4, 5]
# }

# # Create the Gradient Boosting Classifier
# gb = GradientBoostingClassifier(random_state=42)

# # GridSearchCV with the Gradient Boosting Classifier and parameter grid
# grid_search_gb = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring='f1_macro')
# grid_search_gb.fit(X_train, y_train)  # Fit the grid search to your training data

# # Get the best parameters and best score from the grid search
# best_params_gb = grid_search_gb.best_params_
# best_score_gb = grid_search_gb.best_score_

In [32]:
# best_params_gb

In [33]:
#launch training job with asynch call
RF_sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-01-01-20-38-20-283


2024-01-01 20:38:21 Starting - Starting the training job...
2024-01-01 20:38:45 Starting - Preparing the instances for training......
2024-01-01 20:39:42 Downloading - Downloading input data...
2024-01-01 20:40:17 Downloading - Downloading the training image...
2024-01-01 20:40:47 Training - Training image download completed. Training in progress..[34m2024-01-01 20:40:52,881 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-01-01 20:40:52,887 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-01 20:40:52,955 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-01-01 20:40:53,194 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-01 20:40:53,218 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-01 20:40:53,241 sagemaker-training-toolkit INFO     No GPUs 

In [34]:
GB_sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: GB-custom-sklearn-2024-01-01-20-41-34-372


2024-01-01 20:41:35 Starting - Starting the training job...
2024-01-01 20:41:48 Starting - Preparing the instances for training......
2024-01-01 20:43:01 Downloading - Downloading input data...
2024-01-01 20:43:31 Downloading - Downloading the training image.....
2024-01-01 20:44:28 Training - Training image download completed. Training in progress.
2024-01-01 20:44:28 Uploading - Uploading generated training model
2024-01-01 20:44:28 Completed - Training job completed
[34m2024-01-01 20:44:10,560 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-01-01 20:44:10,564 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-01 20:44:10,616 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-01-01 20:44:10,784 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-01 20:44:10,797 sagemaker-training-toolkit INFO     

In [35]:
SVM_sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: SVM-custom-sklearn-2024-01-01-20-49-54-556


2024-01-01 20:49:55 Starting - Starting the training job...
2024-01-01 20:50:13 Starting - Preparing the instances for training......
2024-01-01 20:51:11 Downloading - Downloading input data...
2024-01-01 20:51:41 Downloading - Downloading the training image...
2024-01-01 20:52:22 Training - Training image download completed. Training in progress.
2024-01-01 20:52:22 Uploading - Uploading generated training model[34m2024-01-01 20:52:17,051 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-01-01 20:52:17,054 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-01 20:52:17,088 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-01-01 20:52:17,239 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-01 20:52:17,250 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m20

In [36]:
RF_sklearn_estimator.latest_training_job.wait(logs="None")
RF_artifact = sm_boto3.describe_training_job(
    TrainingJobName=RF_sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + RF_artifact)


2024-01-01 20:41:13 Starting - Preparing the instances for training
2024-01-01 20:41:13 Downloading - Downloading the training image
2024-01-01 20:41:13 Training - Training image download completed. Training in progress.
2024-01-01 20:41:13 Uploading - Uploading generated training model
2024-01-01 20:41:13 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-west-1-510855072877/RF-custom-sklearn-2024-01-01-20-38-20-283/output/model.tar.gz


In [37]:
GB_sklearn_estimator.latest_training_job.wait(logs="None")
GB_artifact = sm_boto3.describe_training_job(
    TrainingJobName=GB_sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + GB_artifact)


2024-01-01 20:44:28 Starting - Preparing the instances for training
2024-01-01 20:44:28 Downloading - Downloading the training image
2024-01-01 20:44:28 Training - Training image download completed. Training in progress.
2024-01-01 20:44:28 Uploading - Uploading generated training model
2024-01-01 20:44:28 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-west-1-510855072877/GB-custom-sklearn-2024-01-01-20-41-34-372/output/model.tar.gz


In [38]:
SVM_sklearn_estimator.latest_training_job.wait(logs="None")
SVM_artifact = sm_boto3.describe_training_job(
    TrainingJobName=SVM_sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + SVM_artifact)


2024-01-01 20:52:38 Starting - Preparing the instances for training
2024-01-01 20:52:38 Downloading - Downloading the training image
2024-01-01 20:52:38 Training - Training image download completed. Training in progress.
2024-01-01 20:52:38 Uploading - Uploading generated training model
2024-01-01 20:52:38 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-west-1-510855072877/SVM-custom-sklearn-2024-01-01-20-49-54-556/output/model.tar.gz


In [39]:
from sagemaker.sklearn.model import SKLearnModel
from urllib.parse import urlparse
from time import gmtime, strftime
import os

names = [
    "RF",
    "GB", 
    "SVM"
]
model_paths = [
    RF_artifact,
    GB_artifact, 
    SVM_artifact
]

scripts = [
    "RF_script.py",
    "GB_script.py", 
    "SVM_script.py"  
]

models = []

for i in range(3): 
    model_name = names[i] + "-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    model = SKLearnModel(
        name =  model_name,
        model_data=model_paths[i],
        role="arn:aws:iam::510855072877:role/service-role/AmazonSageMaker-ExecutionRole-20231230T072497",
        entry_point=scripts[i],
        framework_version=FRAMEWORK_VERSION,
    )
    models.append(model)

In [40]:
models

[<sagemaker.sklearn.model.SKLearnModel at 0x28544a410>,
 <sagemaker.sklearn.model.SKLearnModel at 0x28544a290>,
 <sagemaker.sklearn.model.SKLearnModel at 0x177bc47d0>]

In [41]:
# Deploy all models
for i, model in enumerate(models):
    endpoint_name = names[i] + "-endpoint-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    if i == 0: 
        RF_predictor = model.deploy(
            initial_instance_count=1,
            instance_type="ml.m4.xlarge",
            endpoint_name=endpoint_name,
        )
    elif i == 1: 
        GB_predictor = model.deploy(
            initial_instance_count=1,
            instance_type="ml.m4.xlarge",
            endpoint_name=endpoint_name,
        )
    elif i == 2:  
        SVM_predictor = model.deploy(
            initial_instance_count=1,
            instance_type="ml.m4.xlarge",
            endpoint_name=endpoint_name,
        )

INFO:sagemaker:Creating model with name: RF-sklearn-model-2024-01-01-20-53-24
INFO:sagemaker:Creating endpoint-config with name RF-endpoint-2024-01-01-20-53-24
INFO:sagemaker:Creating endpoint with name RF-endpoint-2024-01-01-20-53-24


!

INFO:sagemaker:Creating model with name: GB-sklearn-model-2024-01-01-20-53-24
INFO:sagemaker:Creating endpoint-config with name GB-endpoint-2024-01-01-21-00-43
INFO:sagemaker:Creating endpoint with name GB-endpoint-2024-01-01-21-00-43


----!

INFO:sagemaker:Creating model with name: SVM-sklearn-model-2024-01-01-20-53-24
INFO:sagemaker:Creating endpoint-config with name SVM-endpoint-2024-01-01-21-03-35
INFO:sagemaker:Creating endpoint with name SVM-endpoint-2024-01-01-21-03-35


-!

In [56]:
testX[0:2][features].values.tolist()

[[56.16,
  0.053888889,
  0.063611111,
  0.003944444,
  0.002055556,
  0.108555556,
  0.001888889,
  0.104611111,
  0.102944444,
  0.172166667,
  0.225722222,
  0.002012727,
  0.001990266,
  374.72,
  374.72,
  18.05924934,
  24.93604234,
  18.05924934,
  24.93604234,
  18.05924934,
  24.93604234,
  56.56301129,
  63.4800056,
  67.60276894,
  0.0,
  0.0,
  38.57002847,
  49.60792149,
  0.0,
  38.57002847,
  49.60792149,
  38.57002847,
  49.60792149,
  74.52001282,
  9.281966808,
  -7.674113687,
  12.90826983,
  -8.506900816,
  1.533456947,
  26.0,
  49.0,
  21.66567793,
  300.3265306,
  55.91935175,
  30.33261789,
  375.3458139,
  0.0,
  0.0,
  -0.08088808,
  0.1084182995,
  -0.116240702,
  0.022725356,
  6.0,
  12.24489796],
 [72.0,
  0.059846154,
  0.073179487,
  0.028923077,
  0.014871795,
  0.118871795,
  0.014051282,
  0.089948718,
  0.110615385,
  0.192051282,
  0.273076923,
  0.002012727,
  0.001990266,
  206.6595745,
  206.1276596,
  19.80660452,
  25.40938417,
  24.95594325,
 

In [60]:
print(RF_predictor.predict(testX[0:10][features].values.tolist()))

['ARR' 'AFF' 'ARR' 'ARR' 'CHF' 'AFF' 'ARR' 'NSR' 'NSR' 'NSR']


In [None]:
['ARR' 'AFF' 'ARR' 'ARR' 'CHF' 'AFF' 'ARR' 'NSR' 'NSR' 'NSR']
['ARR' 'AFF' 'ARR' 'ARR' 'CHF' 'AFF' 'ARR' 'NSR' 'NSR' 'NSR']

In [None]:
# Delete all endpoints
endpoints = sm_boto3.list_endpoints()

for endpoint in endpoints['Endpoints']:
    print(endpoint['EndpointName'])
    sm_boto3.delete_endpoint(EndpointName=endpoint['EndpointName'])