In [1]:
!pip install sagemaker boto3 pandas scikit-learn seaborn matplotlib



In [10]:
import sagemaker
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sagemaker import Session
from sagemaker.sklearn import SKLearn
from sagemaker.model import Model
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import io
import boto3
import os

In [11]:
# SageMaker session for this notebook and the execution role used by the jobs below
session = Session()
role = get_execution_role()

In [12]:
# S3 location shared by every cell below
bucket_name = "test-harithabal"  # Replace with your bucket name
prefix = "diabetes"  # Key prefix (path) inside the bucket

# S3 client used for uploads, plus the region of the default boto3 session
s3 = boto3.client("s3")
region = boto3.Session().region_name

In [13]:
# Push the raw CSV to S3 under the notebook's prefix
data_file = "diabetes.csv"  # Must be present in the notebook's working directory
data_key = f"{prefix}/{data_file}"
s3.upload_file(Filename=data_file, Bucket=bucket_name, Key=data_key)

s3_data_path = f"s3://{bucket_name}/{data_key}"
print(f"Dataset uploaded to: {s3_data_path}")


Dataset uploaded to: s3://test-harithabal/diabetes/diabetes.csv


In [14]:
# Read the uploaded CSV through the existing boto3 client instead of an
# "s3://" URL: pandas resolves such URLs via s3fs, which the install cell does
# not install and whose outdated copy raises a performance warning on load.
csv_bytes = s3.get_object(Bucket=bucket_name, Key=f"{prefix}/{data_file}")["Body"].read()
data = pd.read_csv(io.BytesIO(csv_bytes))
data

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [15]:
# Separate the feature columns from the "Outcome" label
x = data.drop(columns="Outcome")
y = data["Outcome"].to_numpy().ravel()


In [16]:
# Hold out half of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=50,
    test_size=0.5,
)
x_train.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
dtype: object

In [17]:
# Peek at one feature column of the training split
x_train.loc[:, "DiabetesPedigreeFunction"]

636    0.153
140    0.268
313    0.626
75     0.140
492    0.145
       ...  
132    0.356
289    0.263
109    0.247
480    0.344
688    0.828
Name: DiabetesPedigreeFunction, Length: 384, dtype: float64

In [19]:
# Standardize features: statistics are learned from the training split only,
# then the same transform is applied to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [20]:
def _with_labels(features, labels):
    """Return one frame with the feature columns followed by the label column."""
    return pd.concat([pd.DataFrame(features), pd.DataFrame(labels)], axis=1)


# Assemble the train and test frames (features first, label last)
train_data = _with_labels(x_train, y_train)
test_data = _with_labels(x_test, y_test)

In [21]:
# Write both splits as headerless, index-free CSV files in the working directory
for frame, csv_name in ((train_data, "train_data.csv"), (test_data, "test_data.csv")):
    frame.to_csv(csv_name, header=False, index=False)

In [22]:
# Copy each local split file to its own folder under the S3 prefix
for local_csv, s3_key in (
    ("train_data.csv", f"{prefix}/train/train_data.csv"),
    ("test_data.csv", f"{prefix}/test/test_data.csv"),
):
    s3.upload_file(local_csv, bucket_name, s3_key)

In [23]:
# S3 URIs of the two uploaded splits
s3_root = f"s3://{bucket_name}/{prefix}"
train_s3_path = f"{s3_root}/train/train_data.csv"
test_s3_path = f"{s3_root}/test/test_data.csv"

print(f"Train data uploaded to: {train_s3_path}")
print(f"Test data uploaded to: {test_s3_path}")

Train data uploaded to: s3://test-harithabal/diabetes/train/train_data.csv
Test data uploaded to: s3://test-harithabal/diabetes/test/test_data.csv


In [15]:
# Write the SageMaker entry-point script to scripts/sklearn_svc.py.
# The same file serves training (its __main__ section) and inference (model_fn).
# os.makedirs replaces the `!mkdir -p` shell escape so the cell is plain Python.
os.makedirs("scripts", exist_ok=True)
with open("scripts/sklearn_svc.py", "w") as f:
    f.write('''\
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.svm import SVC
import joblib


# Model loading function
def model_fn(model_dir):
    """
    Load the trained model from the specified directory.

    Args:
        model_dir (str): Directory where the model artifact is stored.

    Returns:
        model: Loaded model object.

    Raises:
        FileNotFoundError: If model.joblib is missing from model_dir.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")
    return joblib.load(model_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Prefer the locations SageMaker exports through SM_* environment variables;
    # the literal paths remain as fallbacks, so behaviour is unchanged when unset.
    parser.add_argument(
        "--train",
        type=str,
        default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"),
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
    )
    args = parser.parse_args()

    # Load training data: headerless CSV, features first, label in the last column
    train_file = os.path.join(args.train, "train_data.csv")
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"Training data file not found at {train_file}")

    train_data = pd.read_csv(train_file, header=None)
    x_train = train_data.iloc[:, :-1].values
    y_train = train_data.iloc[:, -1].values

    # Train the model
    svc_model = SVC()  # Default parameters; you can tune this if needed
    svc_model.fit(x_train, y_train)

    # Save the model under the file name model_fn expects
    model_path = os.path.join(args.model_dir, "model.joblib")
    os.makedirs(args.model_dir, exist_ok=True)  # Ensure the model directory exists
    joblib.dump(svc_model, model_path)
    print(f"Model successfully saved at {model_path}")
''')


In [16]:
# Settings for the scikit-learn training job: entry script, container version,
# instance type, execution role, and where the model artifact is written.
estimator_settings = dict(
    entry_point="sklearn_svc.py",
    source_dir="scripts",
    framework_version="1.0-1",
    py_version="py3",
    instance_type="ml.m5.large",
    role=role,
    output_path=f"s3://{bucket_name}/{prefix}/output",
)
sklearn_estimator = SKLearn(**estimator_settings)

In [17]:
# Launch the training job, feeding the uploaded CSV through the "train" channel
train_input = TrainingInput(s3_data=train_s3_path, content_type="text/csv")
sklearn_estimator.fit(inputs={"train": train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-11-29-11-25-07-526


2024-11-29 11:25:08 Starting - Starting the training job...
2024-11-29 11:25:22 Starting - Preparing the instances for training...
2024-11-29 11:26:10 Downloading - Downloading the training image......
2024-11-29 11:27:11 Training - Training image download completed. Training in progress...[34m2024-11-29 11:27:16,129 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-11-29 11:27:16,133 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-11-29 11:27:16,135 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-11-29 11:27:16,150 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-11-29 11:27:16,373 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-11-29 11:27:16,376 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34

In [19]:
# Host the trained model on a real-time endpoint backed by a single instance.
# NOTE(review): no cell shown here deletes the endpoint; call
# predictor.delete_endpoint() when finished unless cleanup happens elsewhere.
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
)


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-11-29-11-31-27-000
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2024-11-29-11-31-27-000
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2024-11-29-11-31-27-000


------!

In [46]:

# Example test data in raw (unscaled) feature units, in the training column
# order: replace these values with actual test features
test_instance = np.array([6, 148, 72, 35, 0, 33.6, 0.627, 50])

# The model was trained on standardized features, so apply the fitted scaler
# before building a payload; the column labels match those the scaler was fit on.
test_instance_scaled = scaler.transform(pd.DataFrame([test_instance], columns=x.columns))

# CSV string form of the scaled instance. It has its own name so the
# `test_data` DataFrame is not overwritten by a string.
test_instance_csv = ",".join(map(str, test_instance_scaled[0]))


In [52]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Reload the held-out split from disk: every column but the last is a feature,
# the last column is the label
test_data = pd.read_csv("test_data.csv", header=None)
x_test = test_data.iloc[:, :-1].to_numpy()
y_test = test_data.iloc[:, -1].to_numpy()

# Send the whole test set to the endpoint as one nested list
test_payload = x_test.tolist()
predictions = predictor.predict(test_payload)

# Wrap the response in a NumPy array for the metric functions
predicted_labels = np.array(predictions)

# Report both metrics, heading first
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, predicted_labels))


Confusion Matrix:
[[225  23]
 [ 68  68]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.91      0.83       248
           1       0.75      0.50      0.60       136

    accuracy                           0.76       384
   macro avg       0.76      0.70      0.72       384
weighted avg       0.76      0.76      0.75       384



In [21]:

# Example single input row (unscaled), values in the training column order
new_row = np.array([[5, 148, 75, 22, 110, 33.6, 0.627, 67]])  # Replace with actual values

# Scale the new row using the same scaler used during training. The row is
# labelled with the training feature names because the scaler was fit on a
# DataFrame with those columns.
new_row_scaled = scaler.transform(pd.DataFrame(new_row, columns=x.columns))

print("Scaled Input:", new_row_scaled)



Scaled Input: [[0.36946683 0.78582386 0.32600549 0.12607343 0.20815625 0.20607387
  0.46389334 2.92338235]]




In [22]:
# Turn the scaled array into a nested list and send it to the endpoint
single_payload = new_row_scaled.tolist()
single_prediction = predictor.predict(single_payload)

# The response holds one entry per input row; show the one for this row
predicted_outcome = single_prediction[0]
print("Predicted Outcome:", predicted_outcome)  # 0 or 1

Predicted Outcome: 1
