In [None]:
#Predicting whether an order should be sent to a technical approver


In [None]:
#Part 1: Load and examine the data

In [None]:
!pip install awswrangler
#Load data with AWS Data Wrangler (fast & IAM-aware)

Collecting awswrangler
  Downloading awswrangler-3.14.0-py3-none-any.whl.metadata (16 kB)
Downloading awswrangler-3.14.0-py3-none-any.whl (380 kB)
Installing collected packages: awswrangler
Successfully installed awswrangler-3.14.0


In [3]:
# Cell 2 – Imports & session
import pandas as pd
import awswrangler as wr
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from sklearn.model_selection import train_test_split

role = get_execution_role()
sess = sagemaker.Session()
region = boto3.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [4]:

data_bucket = "machine-learning-for-interview"
subfolder   = "chapter-02"
dataset     = "orders_with_predicted_value.csv"

# Verify bucket exists and you can list it
!aws s3 ls s3://{data_bucket}/{subfolder}/

2025-11-14 18:09:03          0 
2025-11-15 06:30:13      37377 orders_with_predicted_value.csv


In [7]:
import sagemaker
role = sagemaker.get_execution_role()
print(role)  # Outputs the full ARN


arn:aws:iam::585008073988:role/service-role/AmazonSageMaker-ExecutionRole-20251114T120602


In [8]:
# OSError: When getting information for key 'chapter-02/orders_with_predicted_value.csv' in bucket 
#'machine-learning-for-interview': AWS Error ACCESS_DENIED during HeadObject operation: No response body.
# I added a permission and now I'm testing it.

In [9]:

s3 = boto3.client('s3')
bucket = 'machine-learning-for-interview'

# Test bucket existence
try:
    s3.head_bucket(Bucket=bucket)
    print("✅ Bucket accessible")
except Exception as e:
    print("❌ Bucket error:", e)

# Test object list (checks chapter-02/)
try:
    response = s3.list_objects_v2(Bucket=bucket, Prefix='chapter-02/')
    print("✅ Can list objects. Files found:", len(response.get('Contents', [])))
except Exception as e:
    print("❌ List error:", e)

# Test HeadObject on your file
try:
    s3.head_object(Bucket=bucket, Key='chapter-02/orders_with_predicted_value.csv')
    print("✅ File metadata accessible")
except Exception as e:
    print("❌ File access error:", e)

✅ Bucket accessible
✅ Can list objects. Files found: 2
✅ File metadata accessible


In [10]:
# Cell 4 – Load data with AWS Data Wrangler (fast & IAM-aware)
df = wr.s3.read_csv(path=f"s3://{data_bucket}/{subfolder}/{dataset}")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
df.head()

Rows: 1000, Columns: 7


Unnamed: 0,tech_approval_required,requester_id,role,product,quantity,price,total
0,0,E2300,tech,Desk,1,664,664
1,0,E2300,tech,Keyboard,9,649,5841
2,0,E2374,non-tech,Keyboard,1,821,821
3,1,E2374,non-tech,Desktop Computer,24,655,15720
4,0,E2327,non-tech,Desk,1,758,758


In [11]:
# Cell 5 – Quick EDA (same as book)
print("\nTarget distribution:")
print(df["tech_approval_required"].value_counts())


Target distribution:
tech_approval_required
0    807
1    193
Name: count, dtype: int64


In [None]:
# Part 2 – Feature engineering (unchanged logic)



In [13]:
# One-hot encode everything (the book does this)
encoded = pd.get_dummies(df, drop_first=True)

# Correlation filter > 0.1 with target
target = "tech_approval_required"   # after get_dummies the column becomes this
corrs = encoded.corr()[target].abs()
columns = corrs[corrs > 0.1].index.tolist()
encoded = encoded[columns]

encoded.head()

Unnamed: 0,tech_approval_required,role_tech,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,True,False,True,False,False,False,False
1,0,True,False,False,False,True,False,False
2,0,False,False,False,False,True,False,False
3,1,False,False,False,True,False,False,False
4,0,False,False,True,False,False,False,False


In [None]:
# Part 3 – Train / Val / Test split & upload



In [25]:
# Split (unchanged)
train_df, val_test_df = train_test_split(encoded, test_size=0.3, random_state=0, stratify=encoded[target])
val_df,   test_df     = train_test_split(val_test_df, test_size=0.333, random_state=0, stratify=val_test_df[target])

def to_sagemaker_csv(df, label_col, include_header=False):
    label = df[label_col].astype(int)
    feats = df.drop(columns=[label_col])
    out = pd.concat([label, feats], axis=1)
    return out.to_csv(None, header=include_header, index=False).encode('utf-8')

train_csv = to_sagemaker_csv(train_df, target, include_header=False)
val_csv   = to_sagemaker_csv(val_df,   target, include_header=False)
test_csv  = to_sagemaker_csv(test_df,  target, include_header=True)

# --- UPLOAD USING boto3 (GUARANTEED TO WORK) ---
s3_client = boto3.client('s3')
processed_prefix = f"{subfolder}/processed"

s3_client.put_object(
    Bucket=data_bucket,
    Key=f"{processed_prefix}/train.csv",
    Body=train_csv
)

s3_client.put_object(
    Bucket=data_bucket,
    Key=f"{processed_prefix}/val.csv",
    Body=val_csv
)

s3_client.put_object(
    Bucket=data_bucket,
    Key=f"{processed_prefix}/test.csv",
    Body=test_csv
)

print("Uploaded train / val / test correctly using boto3")

Uploaded train / val / test correctly using boto3


In [None]:
#Part 4 – Train XGBoost (SageMaker SDK v2)



In [26]:
# XGBoost container (latest stable)
image_uri = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")

estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",          # m4 retired → m5/c5
    output_path=f"s3://{data_bucket}/{subfolder}/output",
    sagemaker_session=sess,
    hyperparameters={
        "max_depth": "5",
        "subsample": "0.7",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "num_round": "100",
        "early_stopping_rounds": "10"
    }
)

# TrainingInput replaces s3_input
train_input = TrainingInput(s3_data=f"s3://{data_bucket}/{processed_prefix}/train.csv", content_type="csv")
val_input   = TrainingInput(s3_data=f"s3://{data_bucket}/{processed_prefix}/val.csv",   content_type="csv")

estimator.fit({"train": train_input, "validation": val_input})
print("Training finished")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-15-14-49-20-167


2025-11-15 14:49:21 Starting - Starting the training job...
2025-11-15 14:49:36 Starting - Preparing the instances for training...
2025-11-15 14:50:23 Downloading - Downloading the training image......
  import pkg_resources[0m
[34m[2025-11-15 14:51:30.086 ip-10-2-92-83.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-15 14:51:30.158 ip-10-2-92-83.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-15:14:51:30:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-15:14:51:30:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-15:14:51:30:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-15:14:51:30:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-15:14:51:30:INFO] Running XGBoost Sagemaker in algorithm mod

In [None]:
#Part 5 – Deploy endpoint



In [34]:
import boto3

sagemaker = boto3.client('sagemaker')
account_id = boto3.client('sts').get_caller_identity()['Account']
region = boto3.session.Session().region_name

print(f"Account: {account_id} | Region: {region}\n")

# --- List all endpoints ---
print("ENDPOINTS:")
try:
    endpoints = sagemaker.list_endpoints()['Endpoints']
    if endpoints:
        for ep in endpoints:
            name = ep['EndpointName']
            status = ep['EndpointStatus']
            config = ep.get('EndpointConfigName', 'N/A')
            print(f"  - {name} | Status: {status} | Config: {config}")
    else:
        print("  No endpoints found.")
except Exception as e:
    print("  Error listing endpoints:", e)

print("\n" + "-"*60 + "\n")

# --- List all endpoint configs ---
print("ENDPOINT CONFIGS:")
try:
    configs = sagemaker.list_endpoint_configs()['EndpointConfigs']
    if configs:
        for cfg in configs:
            name = cfg['EndpointConfigName']
            created = cfg['CreationTime'].strftime("%Y-%m-%d %H:%M")
            print(f"  - {name} | Created: {created}")
    else:
        print("  No endpoint configs found.")
except Exception as e:
    print("  Error listing configs:", e)

Account: 585008073988 | Region: us-east-1

ENDPOINTS:
  No endpoints found.

------------------------------------------------------------

ENDPOINT CONFIGS:
  - order-approval-2025 | Created: 2025-11-15 14:21


In [35]:
endpoint_name = "order-approval-2025"
sagemaker = boto3.client('sagemaker')

# === DELETE ENDPOINT CONFIG ONLY ===
try:
    sagemaker.delete_endpoint_config(EndpointConfigName=endpoint_name)
    print(f"Deleted endpoint config: {endpoint_name}")
except Exception as e:
    if 'not found' in str(e).lower():
        print("No config to delete")
    else:
        raise

# === DEPLOY FRESH ===
print("Deploying new endpoint...")
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name=endpoint_name,
    serializer=CSVSerializer()
)

print(f"Endpoint '{endpoint_name}' is ready!")

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-11-15-15-07-48-112


Deleted endpoint config: order-approval-2025
Deploying new endpoint...


INFO:sagemaker:Creating endpoint-config with name order-approval-2025
INFO:sagemaker:Creating endpoint with name order-approval-2025


------!Endpoint 'order-approval-2025' is ready!


In [None]:
#Part 6 – Test the model (batch style – faster)



In [37]:
test_df = wr.s3.read_csv(path=f"s3://{data_bucket}/{processed_prefix}/test.csv")

print("Columns in test_df:")
print(test_df.columns.tolist())
print("\nFirst few rows:")
print(test_df.head())

Columns in test_df:
['tech_approval_required', 'role_tech', 'product_Cleaning', 'product_Desk', 'product_Desktop Computer', 'product_Keyboard', 'product_Laptop Computer', 'product_Mouse']

First few rows:
   tech_approval_required  role_tech  product_Cleaning  product_Desk  \
0                       0      False             False         False   
1                       0      False             False          True   
2                       0      False             False         False   
3                       0      False              True         False   
4                       0      False             False         False   

   product_Desktop Computer  product_Keyboard  product_Laptop Computer  \
0                     False              True                    False   
1                     False             False                    False   
2                     False             False                    False   
3                     False             False                    F

In [39]:
# Load test data
test_df = wr.s3.read_csv(f"s3://{data_bucket}/{processed_prefix}/test.csv")

# Separate label and features
label_col = "tech_approval_required"
X_test = test_df.drop(columns=[label_col])
y_true = test_df[label_col]

# CRITICAL: Convert ALL features to float
# This handles booleans, strings, etc.
X_test_numeric = X_test.astype(float).values  # <-- THIS LINE FIXES IT

# Predict
raw_pred = predictor.predict(X_test_numeric).decode("utf-8")
pred_series = pd.Series([float(p) for p in raw_pred.split("\n") if p.strip()])
pred_series = (pred_series > 0.5).astype(int)

# Accuracy
accuracy = (pred_series == y_true).mean()
print(f"Test accuracy: {accuracy:.1%}")

# Show first 10 rows
result = pd.DataFrame({
    "prediction": pred_series,
    "true": y_true
}).reset_index(drop=True)
result.head(10)

Test accuracy: 81.0%


Unnamed: 0,prediction,true
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,1
7,0,0
8,0,0
9,0,1


In [40]:
# Comment out if you want to keep the endpoint
sess.delete_endpoint(endpoint_name)
!aws s3 rm s3://{data_bucket}/{subfolder}/output --recursive   # optional

INFO:sagemaker:Deleting endpoint with name: order-approval-2025


delete: s3://machine-learning-for-interview/chapter-02/output/sagemaker-xgboost-2025-11-15-14-49-20-167/debug-output/collections/000000000/worker_0_collections.json
delete: s3://machine-learning-for-interview/chapter-02/output/sagemaker-xgboost-2025-11-15-14-49-20-167/debug-output/claim.smd
delete: s3://machine-learning-for-interview/chapter-02/output/sagemaker-xgboost-2025-11-15-14-49-20-167/profiler-output/system/incremental/2025111514/1763218260.algo-1.json
delete: s3://machine-learning-for-interview/chapter-02/output/sagemaker-xgboost-2025-11-15-14-49-20-167/debug-output/events/000000000000/000000000000_worker_0.tfevents
delete: s3://machine-learning-for-interview/chapter-02/output/sagemaker-xgboost-2025-11-15-14-49-20-167/profiler-output/system/incremental/2025111514/1763218200.algo-1.json
delete: s3://machine-learning-for-interview/chapter-02/output/sagemaker-xgboost-2025-11-15-14-49-20-167/profiler-output/system/training_job_end.ts
delete: s3://machine-learning-for-interview/cha