# Cell 1: Imports & Configuration

Set up libraries, boto3/sagemaker clients, and variables used by all later cells (edit the bucket name accordingly)


In [None]:
import boto3, io, pandas as pd
from sklearn.model_selection import train_test_split
import sagemaker
from sagemaker import get_execution_role

# --- Static settings (edit these for your own account) ---
region = "us-east-1"  # update if needed
bucket = "healthcare-project-data-jayesh-devre"
hist_key = "raw/historical/heart_attack_prediction_dataset.csv"  # object key of the source CSV inside `bucket`

# --- Live AWS handles shared by the later cells ---
role = get_execution_role()  # IAM role passed to the SageMaker estimator/model
s3 = boto3.client("s3", region_name=region)


# Cell 2: Load Processed Dataset & Preview

Load the processed/merged CSV from S3 (produced by EMR) to confirm shape & columns.


In [None]:
# Fetch the CSV object identified by `hist_key` from the configured bucket.
obj = s3.get_object(Bucket=bucket, Key=hist_key)
# Read the whole response body into memory and parse it as CSV.
df = pd.read_csv(io.BytesIO(obj["Body"].read()))

# Quick sanity check: (rows, columns) plus the first two rows.
print("Loaded dataset:", df.shape)
df.head(2)


# Cell 3: Function to Preprocess the data


In [None]:
def preprocess_health_data(df):
    """Turn the raw health DataFrame into an all-numeric feature table.

    Steps, in order:
      1. Split the "Blood Pressure" column ("<systolic>/<diastolic>") into the
         numeric columns "BP_Systolic" and "BP_Diastolic" and drop the original.
      2. Drop the identifier/location columns "Patient ID", "Country",
         "Continent" and "Hemisphere" when they are present.
      3. One-hot encode the remaining categorical columns (first level dropped)
         and replace every missing value with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is NOT modified; the function works on a copy so
        the cell that calls it can be re-run safely.

    Returns
    -------
    pandas.DataFrame
        A new frame whose columns are all numeric. Dummy columns are integer
        0/1 so they serialise as numbers (not "True"/"False") in CSV output.
    """
    # Work on a copy: the original implementation mutated the caller's frame.
    df = df.copy()

    # Split blood pressure
    if "Blood Pressure" in df.columns:
        bp = (
            df["Blood Pressure"]
            .astype(str)
            .str.split("/", n=1, expand=True)
            # Guarantee both parts exist even when no value contains "/"
            # (or the frame is empty); the missing part becomes NaN.
            .reindex(columns=[0, 1])
        )
        df["BP_Systolic"] = pd.to_numeric(bp[0], errors="coerce")
        df["BP_Diastolic"] = pd.to_numeric(bp[1], errors="coerce")
        df = df.drop(columns=["Blood Pressure"])

    # Drop identifiers
    df = df.drop(columns=["Patient ID", "Country", "Continent", "Hemisphere"], errors="ignore")

    # One-hot encode categoricals. dtype=int forces 0/1 columns; newer pandas
    # versions default to bool, which would be written as True/False in CSV.
    df = pd.get_dummies(df, drop_first=True, dtype=int).fillna(0)

    return df


# Cell 4: Preprocessing and then splitting dataset into train/test


In [None]:
# Run the preprocessing function on the loaded frame.
proc_df = preprocess_health_data(df)

# Separate the features from the integer-cast label column.
X = proc_df.drop(columns=["Heart Attack Risk"])
y = proc_df["Heart Attack Risk"].astype(int)

# Re-assemble with the label as the first column, followed by the features,
# then carve out a stratified 80/20 split with a fixed seed.
final_df = pd.concat([y, X], axis=1)
train_df, test_df = train_test_split(
    final_df,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train:", train_df.shape, "| Test:", test_df.shape)
print("\n Sample training row:")
display(train_df.head(1))


# Cell 5: Upload the train/test datasets to S3 as CSV


In [None]:
# Destination object keys for the two splits.
train_key = "raw/historical/train/train.csv"
test_key = "raw/historical/test/test.csv"


def upload_csv(df, key):
    """Write `df` to s3://<bucket>/<key> as CSV with no header row and no index.

    Uses the notebook-level `s3` client and `bucket` name, and prints the
    destination URI once the object has been written.
    """
    payload = df.to_csv(index=False, header=False).encode()
    s3.put_object(Bucket=bucket, Key=key, Body=payload)
    print(f"Uploaded → s3://{bucket}/{key}")


upload_csv(train_df, train_key)
upload_csv(test_df, test_key)


# Cell 6: Upload feature list to S3

Upload the feature list to S3 so that it can be used when the simulated processed data is run through the trained model.


In [None]:
# Record the feature column names, in the same order as the columns of X,
# one name per line.
feature_list = list(X.columns)
feature_list_key = "preprocess/feature_list.txt"

with open("feature_list.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(feature_list))

# Upload with the boto3 client created in Cell 1 instead of shelling out to the
# AWS CLI: a failing `!aws ...` command does not raise, so the success message
# below would be printed even when nothing was uploaded. upload_file raises on
# failure, which stops the cell before the message is shown.
s3.upload_file("feature_list.txt", bucket, feature_list_key)
print(f" Uploaded feature list → s3://{bucket}/{feature_list_key}")


# Cell 7: Train the model


In [None]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
import time

# Timestamp embedded in the job's base name; S3 locations for outputs and inputs.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
output_path = f"s3://{bucket}/models/xgboost"
train_input = f"s3://{bucket}/{train_key}"
test_input = f"s3://{bucket}/{test_key}"

# Container image URI for the XGBoost algorithm (version 1.5-1) in the configured region.
xgb_image = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.5-1")

# Single ml.m5.large training instance with a 5 GB volume.
xgb_estimator = Estimator(
    image_uri=xgb_image,
    role=role,
    base_job_name=f"xgboost-heart-attack-{timestamp}",
    output_path=output_path,
    instance_type="ml.m5.large",
    instance_count=1,
    volume_size=5,
)

# Binary classifier hyperparameters, evaluated with AUC.
xgb_estimator.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=100,
    max_depth=5,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
)

print(" Starting training job.")
# The train split feeds the "train" channel; the test split is supplied as
# the "validation" channel. Both are CSV.
xgb_estimator.fit(
    {
        "train": TrainingInput(train_input, content_type="text/csv"),
        "validation": TrainingInput(test_input, content_type="text/csv"),
    }
)

# S3 location of the trained model artifact, reused by the deployment cell.
model_artifact = xgb_estimator.model_data
print(" Model training complete!")
print(" Model artifact stored at:", model_artifact)


# Cell 8: Deploy the Model


In [None]:
from sagemaker.model import Model
import sagemaker, time

from sagemaker.predictor import Predictor

# Session and timestamp used to name the model and the endpoint.
sagemaker_session = sagemaker.session.Session()
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
endpoint_name = f"xgb-heart-attack-endpoint-{timestamp}"

xgb_image = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.5-1")

# Wrap the trained artifact (model_artifact from the training cell) in a Model.
xgb_model = Model(
    name=f"xgb-heart-attack-{timestamp}",
    image_uri=xgb_image,
    model_data=model_artifact,
    role=role,
    sagemaker_session=sagemaker_session,
)

print(f" Deploying XGBoost model as endpoint: {endpoint_name} .")

# The return value of deploy() is not used here; a Predictor bound to the
# endpoint name is created explicitly afterwards for runtime access.
xgb_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m5.large",
    initial_instance_count=1,
)

predictor = Predictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker_session)

print("\n Model deployed successfully!")
print(" Endpoint name:", endpoint_name)
