In [1]:
import boto3
import pandas as pd
from io import StringIO

# Define your bucket and key (same as your Autopilot data)
bucket = "cardiovale-solutions-datascience-pipeline"
key = "feature-store/cardio/cardio-feature-group-22-21-14-34/autopilot_input.csv"

# Create boto3 client and download
s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)
df_with_split = pd.read_csv(obj['Body'])

print("✅ Data loaded from S3. Shape:", df_with_split.shape)


✅ Data loaded from S3. Shape: (70000, 19)


In [2]:
import pandas as pd
import boto3
from io import StringIO

# Load from S3
bucket = "cardiovale-solutions-datascience-pipeline"
key = "jumpstart/cardio_clean_noheader.csv"

s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)
df = pd.read_csv(obj["Body"], header=None)

# Infer & fix types
for col in df.columns:
    if df[col].dtype == "object":
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError:
            df[col] = df[col].astype("category")

# Save cleaned version (no header for JumpStart)
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False, header=False)

clean_key = "jumpstart/cardio_clean_noheader_fixed.csv"
s3.put_object(Bucket=bucket, Key=clean_key, Body=csv_buffer.getvalue())

print(f"Cleaned file re-uploaded to: s3://{bucket}/{clean_key}")


✅ Clean file uploaded to: s3://cardiovale-solutions-datascience-pipeline/jumpstart/cardio_clean_noheader.csv


In [5]:
import sagemaker
from sagemaker.jumpstart.estimator import JumpStartEstimator

# 1. Setup essentials
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = "cardiovale-solutions-datascience-pipeline"
input_key = "jumpstart/cardio_clean_noheader.csv"
input_s3_uri = f"s3://{bucket}/{input_key}"

# 2. Use JumpStart model ID for XGBoost (or replace with other tabular model)
model_id = "xgboost-classification-model"  # pretrained JumpStart XGBoost

# 3. Define training input location
input_data = {
    "training": input_s3_uri
}

# 4. Configure the JumpStart Estimator
estimator = JumpStartEstimator(
    model_id=model_id,
    role=role,
    instance_type="ml.m5.large",
    sagemaker_session=sagemaker_session,
)

# 5. Launch training job
estimator.fit(inputs=input_data)


Using model 'xgboost-classification-model' with wildcard version identifier '*'. You can pin to version '2.1.7' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
INFO:sagemaker:Creating training-job with name: xgb-classification-model-2025-03-30-23-13-25-609


2025-03-30 23:13:28 Starting - Starting the training job...
2025-03-30 23:13:46 Starting - Preparing the instances for training...
2025-03-30 23:14:04 Downloading - Downloading input data...
2025-03-30 23:14:50 Downloading - Downloading the training image......
2025-03-30 23:15:51 Training - Training image download completed. Training in progress..[34m[2025-03-30 23:15:55.822 algo-1:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-03-30 23:15:55.844 algo-1:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-03-30:23:15:56:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-03-30:23:15:56:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-03-30:23:15:56:INFO] Invoking user training script.[0m
[34m[2025-03-30:23:15:56:INFO] Module transfer_learning does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2025-03-30:23:15:56:INFO] Generating setup.cfg[0m
[34m[2025-03-30:23:15:56:I

UnexpectedStatusException: Error for Training job xgb-classification-model-2025-03-30-23-13-25-609: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/miniconda3/bin/python3 -m transfer_learning --colsample_bytree 1 --early_stopping_rounds 30 --gamma 0 --learning_rate 0.3 --max_depth 6 --min_child_weight 1 --num_boost_round 5000 --reg_alpha 0 --reg_lambda 1 --subsample 1", exit code: 1

In [4]:
# 1. Load and clean dataset for JumpStart
import boto3
import pandas as pd
from io import StringIO

# Load Autopilot dataset from S3
bucket = "cardiovale-solutions-datascience-pipeline"
key = "feature-store/cardio/cardio-feature-group-22-21-14-34/autopilot_input.csv"

s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)
df_with_split = pd.read_csv(obj["Body"])
print("✅ Data loaded from S3. Shape:", df_with_split.shape)

# Drop split_type column and reorder cardio to be first
df = df_with_split.drop(columns=["split_type"], errors="ignore").copy()
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index("cardio")))
df = df[cols]

# Fix data types: convert object → numeric or category
for col in df.columns:
    if df[col].dtype == "object":
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError:
            df[col] = df[col].astype("category")

print("\n✅ Final cleaned types:\n", df.dtypes)

# Save to CSV (no header, as required by JumpStart)
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False, header=False)

# Upload clean file to S3
clean_key = "jumpstart/cardio_clean_noheader_fixed.csv"
s3.put_object(Bucket=bucket, Key=clean_key, Body=csv_buffer.getvalue())
print(f"\n✅ Cleaned dataset re-uploaded to: s3://{bucket}/{clean_key}")


✅ Data loaded from S3. Shape: (70000, 19)

✅ Final cleaned types:
 cardio                        int64
id                            int64
age                           int64
gender                        int64
height                        int64
weight                      float64
ap_hi                         int64
ap_lo                         int64
cholesterol                   int64
gluc                          int64
smoke                         int64
alco                          int64
active                        int64
State                      category
offered_for_free            float64
medical_restrictions        float64
number_of_weeks_offered     float64
event_time                 category
dtype: object

✅ Cleaned dataset re-uploaded to: s3://cardiovale-solutions-datascience-pipeline/jumpstart/cardio_clean_noheader_fixed.csv


In [5]:
# 2. Launch JumpStart model training
import sagemaker
from sagemaker.jumpstart.estimator import JumpStartEstimator

# Setup
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

model_id = "xgboost-classification-model"
input_s3_uri = f"s3://{bucket}/{clean_key}"

# JumpStart requires this input format
input_data = {
    "training": input_s3_uri
}

# Estimator
estimator = JumpStartEstimator(
    model_id=model_id,
    role=role,
    instance_type="ml.m5.large",
    sagemaker_session=sagemaker_session
)

print("🚀 Launching JumpStart model training...")
estimator.fit(inputs=input_data)


Using model 'xgboost-classification-model' with wildcard version identifier '*'. You can pin to version '2.1.7' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


🚀 Launching JumpStart model training...


INFO:sagemaker:Creating training-job with name: xgb-classification-model-2025-03-31-01-18-27-080


2025-03-31 01:18:29 Starting - Starting the training job...
2025-03-31 01:18:46 Starting - Preparing the instances for training...
2025-03-31 01:19:07 Downloading - Downloading input data...
2025-03-31 01:19:57 Downloading - Downloading the training image......
2025-03-31 01:20:58 Training - Training image download completed. Training in progress..[34m[2025-03-31 01:21:01.089 algo-1:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-03-31 01:21:01.112 algo-1:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-03-31:01:21:01:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-03-31:01:21:01:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-03-31:01:21:01:INFO] Invoking user training script.[0m
[34m[2025-03-31:01:21:02:INFO] Module transfer_learning does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2025-03-31:01:21:02:INFO] Generating setup.cfg[0m
[34m[2025-03-31:01:21:02:I

UnexpectedStatusException: Error for Training job xgb-classification-model-2025-03-31-01-18-27-080: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/miniconda3/bin/python3 -m transfer_learning --colsample_bytree 1 --early_stopping_rounds 30 --gamma 0 --learning_rate 0.3 --max_depth 6 --min_child_weight 1 --num_boost_round 5000 --reg_alpha 0 --reg_lambda 1 --subsample 1", exit code: 1