In [15]:
import json
import pandas as pd
import boto3

# Single S3 client reused for every listing, download and upload in this cell.
s3 = boto3.client("s3")

# Historical roster snapshots: (bucket, key prefix).
bucket_rosters, prefix_rosters = (
    "hospital-rosters-history-expendables-us-20250918",
    "historical/",
)

# Nurse master-data snapshots: (bucket, key prefix).
bucket_nurses, prefix_nurses = (
    "hospital-roster-data",
    "raw_data/nurse_data/",
)

# --- Helper: shift hours ---
def shift_hours(shift_name):
    """Return the length of a shift in hours, derived from its name.

    A name containing the substring "Half" (case-sensitive) counts as 4 hours.
    Every other name, including a missing or empty one, counts as 8 hours.
    """
    is_half_shift = bool(shift_name) and "Half" in shift_name
    return 4 if is_half_shift else 8

# --- List nurse data files and roster files ---
def _list_json_keys(bucket, prefix):
    """Return the sorted keys under `prefix` in `bucket` that end in ".json".

    The suffix test is case-insensitive. The sort is a plain string sort, so
    e.g. "nurse10.json" orders before "nurse2.json".
    """
    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    return sorted(
        entry["Key"]
        for page in pages
        for entry in page.get("Contents", [])
        if entry["Key"].lower().endswith(".json")
    )


# Both lists are sorted the same way because they are later paired by position.
nurse_files = _list_json_keys(bucket_nurses, prefix_nurses)
print(f"Found {len(nurse_files)} nurse datasets")

roster_files = _list_json_keys(bucket_rosters, prefix_rosters)
print(f"Found {len(roster_files)} roster snapshots")

# --- Sanity check ---
# Only the counts are compared; matching names are not verified.
if len(roster_files) != len(nurse_files):
    raise ValueError("Mismatch: # of nurse datasets != # of rosters")

# --- Build dataset ---
def _summarize_workload(roster_json):
    """Aggregate every rostered nurse's workload for one roster snapshot.

    Returns three dicts keyed by the roster's nurse id:
      * total scheduled hours over the whole roster,
      * scheduled hours per day, as ``{day: hours}``,
      * the set of days on which the nurse has at least one shift.

    A nurse listed under several departments has all of their shifts merged.
    """
    week_hours, day_hours, days_worked = {}, {}, {}
    for dept in roster_json.get("departments", []):
        for nurse in dept.get("nurses", []):
            nurse_id = nurse["id"]
            week_hours.setdefault(nurse_id, 0)
            per_day = day_hours.setdefault(nurse_id, {})
            worked = days_worked.setdefault(nurse_id, set())
            for cs in nurse.get("shifts", []):
                # .get mirrors how the same shift dicts are read when rows are built.
                hours = shift_hours(cs.get("shift"))
                cs_day = cs.get("day")
                week_hours[nurse_id] += hours
                per_day[cs_day] = per_day.get(cs_day, 0) + hours
                if cs_day is not None:
                    worked.add(cs_day)
    return week_hours, day_hours, days_worked


rows = []
for nurse_key, roster_key in zip(nurse_files, roster_files):
    # Load the nurse master list for this snapshot, keyed by nurse id.
    obj = s3.get_object(Bucket=bucket_nurses, Key=nurse_key)
    nurse_list = json.loads(obj["Body"].read().decode("utf-8"))
    nurse_master = {n["nurse_id"]: n for n in nurse_list}

    # Load the roster paired (by position) with that nurse file.
    body = s3.get_object(Bucket=bucket_rosters, Key=roster_key)["Body"].read().decode("utf-8")
    roster_json = json.loads(body)

    # Workload totals depend only on the roster, so compute them once per
    # snapshot instead of once per (shift, candidate) pair.
    week_hours, day_hours, days_worked = _summarize_workload(roster_json)

    # Per-candidate values that do not depend on the shift being scored.
    cand_static = {}
    cand_rest_day = {}
    for cand_id, cand_data in nurse_master.items():
        cand_static[cand_id] = {
            "cand_experience": cand_data.get("experience_years"),
            "cand_seniority": cand_data.get("seniority_level"),
            "cand_hours_contract": cand_data.get("contracted_hours"),
            # `or []` also covers an explicit null in the JSON.
            "cand_pref": ",".join(cand_data.get("preferences") or []),
            "cand_skills": ",".join(cand_data.get("skills") or []),
        }
        # Rest day: fewer than 7 distinct days worked in this roster.
        cand_rest_day[cand_id] = 1 if len(days_worked.get(cand_id, ())) < 7 else 0

    # Build pairwise rows: every rostered shift x every nurse in the master list.
    for dept in roster_json.get("departments", []):
        dept_name = dept.get("name")
        for n in dept.get("nurses", []):
            assigned_nurse_id = n.get("id")
            for s in n.get("shifts", []):
                day = s.get("day")
                shift_type = s.get("shift")
                sh_hours = shift_hours(shift_type)

                for cand_id in nurse_master:
                    is_assigned = cand_id == assigned_nurse_id

                    # The roster totals already contain the shift being scored
                    # for the nurse who actually worked it. Remove it so the
                    # compliance features describe the candidate's load
                    # *without* this shift; otherwise the positive row counts
                    # its own shift twice and leaks the label.
                    own_hours = sh_hours if is_assigned else 0
                    hours_week = week_hours.get(cand_id, 0) - own_hours
                    hours_day = day_hours.get(cand_id, {}).get(day, 0) - own_hours

                    rows.append({
                        "nurse_key": nurse_key,
                        "roster_key": roster_key,
                        "dept": dept_name,
                        "day": day,
                        "shift_type": shift_type,
                        "target_nurse": assigned_nurse_id,
                        "candidate_nurse": cand_id,
                        "label": 1 if is_assigned else 0,
                        # static features
                        **cand_static[cand_id],
                        # compliance features
                        "hours_in_week": hours_week,
                        "would_violate_45": 1 if hours_week + sh_hours > 45 else 0,
                        "would_violate_8_per_day": 1 if hours_day + sh_hours > 8 else 0,
                        "has_rest_day": cand_rest_day[cand_id],
                    })

# Materialise the collected rows and show a quick preview.
df = pd.DataFrame(rows)
print("Dataset shape:", df.shape)
print(df.head())

# Save dataset to S3: write a local parquet file first, then upload that file.
local_path = "/tmp/pairwise_weekly_compliance.parquet"
dataset_key = "training/pairwise_weekly_compliance.parquet"
df.to_parquet(local_path, index=False)
s3.upload_file(Filename=local_path, Bucket=bucket_nurses, Key=dataset_key)
print("✅ Saved dataset to S3")


Found 95 nurse datasets
Found 95 roster snapshots
Dataset shape: (1473200, 17)
                         nurse_key               roster_key     dept  day  \
0  raw_data/nurse_data/nurse1.json  historical/roster1.json  General  Tue   
1  raw_data/nurse_data/nurse1.json  historical/roster1.json  General  Tue   
2  raw_data/nurse_data/nurse1.json  historical/roster1.json  General  Tue   
3  raw_data/nurse_data/nurse1.json  historical/roster1.json  General  Tue   
4  raw_data/nurse_data/nurse1.json  historical/roster1.json  General  Tue   

   shift_type target_nurse candidate_nurse  label  cand_experience  \
0  Full-Night         N003            N001      0                6   
1  Full-Night         N003            N002      0                2   
2  Full-Night         N003            N003      1                4   
3  Full-Night         N003            N004      0                1   
4  Full-Night         N003            N005      0                8   

  cand_seniority  cand_hours_contract

In [16]:
import pandas as pd
import boto3

bucket_nurses = "hospital-roster-data"   # same bucket as Cell 1
s3 = boto3.client("s3")

# ---- Load parquet dataset from S3 (output of Cell 1) ----
# The URI is built from the bucket name so the two cannot drift apart.
df = pd.read_parquet(f"s3://{bucket_nurses}/training/pairwise_weekly_compliance.parquet")

# ---- Feature engineering ----
# Ordinal encoding of seniority; levels missing from the map fall back to 0.
seniority_map = {"Junior": 0, "Mid": 1, "Senior": 2}
df["cand_seniority_num"] = df["cand_seniority"].map(seniority_map).fillna(0)

# Preference flags: case-insensitive substring match on the comma-joined list.
df["pref_morning"] = df["cand_pref"].str.contains("Morning", case=False, na=False).astype(int)
df["pref_evening"] = df["cand_pref"].str.contains("Evening", case=False, na=False).astype(int)
df["pref_night"]   = df["cand_pref"].str.contains("Night",   case=False, na=False).astype(int)

# Skill flags: match a whole comma-separated token, not a substring. A plain
# case-insensitive substring test for "ER" also fires on "General".
# NOTE(review): inference must build these flags the same way as training.
for skill in ["ER", "General", "ICU", "OT", "Pediatrics"]:
    skill_pattern = rf"(?:^|,)\s*{skill}\s*(?:,|$)"
    df[f"skill_{skill}"] = (
        df["cand_skills"].str.contains(skill_pattern, case=False, na=False).astype(int)
    )

feature_cols = [
    "cand_experience", "cand_hours_contract", "cand_seniority_num",
    "pref_morning", "pref_evening", "pref_night",
    "skill_ER", "skill_General", "skill_ICU", "skill_OT", "skill_Pediatrics",
    "hours_in_week", "would_violate_45", "would_violate_8_per_day", "has_rest_day"
]

# The label goes FIRST: SageMaker's built-in XGBoost reads header-less CSV with
# the target in the first column. With the label last, the job would train on
# `cand_experience` as the target and treat the real label as a feature.
df_out = df[["label"] + feature_cols]

# ---- Train/test split ----
# Seeded 80/20 split over individual rows; the test set is every row whose
# index label was not sampled into train.
# NOTE(review): rows of the same roster/shift can land on both sides of this
# split — confirm that is acceptable for evaluation.
train = df_out.sample(frac=0.8, random_state=42)
test = df_out.loc[~df_out.index.isin(train.index)]

# ---- Save locally as CSV ----
# No header and no index column are written.
train_path = "/tmp/train.csv"
test_path = "/tmp/test.csv"
for frame, csv_path in ((train, train_path), (test, test_path)):
    frame.to_csv(csv_path, header=False, index=False)

# ---- Upload to S3 ----
prefix = "xgboost/data"
uploads = (
    (train_path, f"{prefix}/train/train.csv"),
    (test_path, f"{prefix}/test/test.csv"),
)
for csv_path, s3_key in uploads:
    s3.upload_file(csv_path, bucket_nurses, s3_key)

print(f"✅ Train/Test CSV uploaded to s3://{bucket_nurses}/{prefix}/")


✅ Train/Test CSV uploaded to s3://hospital-roster-data/xgboost/data/


In [17]:
import sagemaker
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# SageMaker session and the execution role the training job runs under.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Location of the CSV channels and of the job output.
bucket = "hospital-roster-data"
prefix = "xgboost/data"
data_root = f"s3://{bucket}/{prefix}"

# ---- XGBoost container image ----
xgb_image = image_uris.retrieve("xgboost", session.boto_region_name, version="1.5-1")

# ---- Estimator definition ----
estimator_config = {
    "image_uri": xgb_image,
    "role": role,
    "instance_count": 1,
    "instance_type": "ml.m5.xlarge",   # upgrade if needed
    "output_path": f"s3://{bucket}/xgboost/output",
    "sagemaker_session": session,
}
xgb_estimator = Estimator(**estimator_config)

# ---- Hyperparameters ----
# NOTE(review): ranking objectives normally rely on query-group information and
# the CSV channels here carry none — confirm how the container groups rows.
xgb_hyperparameters = {
    "objective": "rank:pairwise",
    "eval_metric": "ndcg",
    "num_round": 100,
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# ---- Input channels ----
train_input = TrainingInput(f"{data_root}/train/", content_type="csv")
validation_input = TrainingInput(f"{data_root}/test/", content_type="csv")
channels = {"train": train_input, "validation": validation_input}

# ---- Launch training job ----
xgb_estimator.fit(channels)


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-09-20-01-07-22-211


2025-09-20 01:07:27 Starting - Starting the training job...
2025-09-20 01:07:42 Starting - Preparing the instances for training...
2025-09-20 01:08:04 Downloading - Downloading input data...
2025-09-20 01:08:29 Downloading - Downloading the training image...
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-09-20 01:09:27.183 ip-10-0-121-66.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-09-20 01:09:27.205 ip-10-0-121-66.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-09-20:01:09:27:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-09-20:01:09:27:INFO] Failed to parse hyperparameter eval_metric value ndcg to Json.[0m
[34mReturning the value itself[0m
[34m[2025-09-20:01:09:27:INFO] Failed to parse hyperparameter objective value rank:pairwise to Json.[0m
[34mReturning the value itself[0m
[34m[2025-09-20:01:09:27:INFO] No GPUs detected (normal if no gpus installed)

In [18]:
import pandas as pd
import xgboost as xgb
import boto3

s3 = boto3.client("s3")

# Reload dataset (raw features only)
# This reads the local copy, so the dataset-building cell must already have
# written it on this machine.
df = pd.read_parquet("/tmp/pairwise_weekly_compliance.parquet")

# --- Recreate preprocessing (same as in Cell 2) ---
# Ordinal encoding of seniority; levels missing from the map fall back to 0.
seniority_map = {"Junior": 0, "Mid": 1, "Senior": 2}
df["cand_seniority_num"] = df["cand_seniority"].map(seniority_map).fillna(0)

# Preference flags: case-insensitive substring match on the comma-joined list.
df["pref_morning"] = df["cand_pref"].str.contains("Morning", case=False, na=False).astype(int)
df["pref_evening"] = df["cand_pref"].str.contains("Evening", case=False, na=False).astype(int)
df["pref_night"]   = df["cand_pref"].str.contains("Night",   case=False, na=False).astype(int)

# Skill flags: match a whole comma-separated token, not a substring. A plain
# case-insensitive substring test for "ER" also fires on "General".
# NOTE(review): the model being scored must have been trained on flags built
# the same way — confirm before comparing scores across model versions.
for skill in ["ER", "General", "ICU", "OT", "Pediatrics"]:
    skill_pattern = rf"(?:^|,)\s*{skill}\s*(?:,|$)"
    df[f"skill_{skill}"] = (
        df["cand_skills"].str.contains(skill_pattern, case=False, na=False).astype(int)
    )

# Same feature set used for training
feature_cols = [
    "cand_experience", "cand_hours_contract", "cand_seniority_num",
    "pref_morning", "pref_evening", "pref_night",
    "skill_ER", "skill_General", "skill_ICU", "skill_OT", "skill_Pediatrics",
    "hours_in_week", "would_violate_45", "would_violate_8_per_day", "has_rest_day"
]

print("✅ Features rebuilt for inference")

# --- Download model from S3 ---
# Source object and the local path it is copied to.
bucket, model_key = "hospital-roster-models", "xgboost/nurse_roster_ranker.json"
local_model_path = "/tmp/nurse_roster_ranker.json"

s3.download_file(Bucket=bucket, Key=model_key, Filename=local_model_path)

# --- Load model ---
# Start from an empty booster and load the downloaded file into it.
model = xgb.Booster()
model.load_model(local_model_path)

# --- Run batch predictions ---
# Score every row; the feature columns are passed in `feature_cols` order.
X = df[feature_cols]
dmat = xgb.DMatrix(X)
df["score"] = model.predict(dmat)

# --- Save predictions back to S3 ---
# The full frame (raw columns, engineered features and score) is written.
pred_path = "/tmp/nurse_predictions.parquet"
df.to_parquet(pred_path, index=False)

# The destination is named once so the upload and the message cannot diverge.
# NOTE(review): this bucket is not the same string as the model bucket used for
# the download ("hospital-roster-models") — confirm which one is intended.
pred_bucket = "hospital-roster-model"
pred_key = "predictions/nurse_predictions.parquet"
s3.upload_file(pred_path, pred_bucket, pred_key)
print(f"✅ Predictions saved to S3: s3://{pred_bucket}/{pred_key}")


✅ Features rebuilt for inference
✅ Predictions saved to S3: s3://hospital-roster-data/predictions/nurse_predictions.parquet
