# Push datasets and model to AWS S3

**Prerequisites**
- AWS CLI configured (`aws configure`) or env vars `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY`
- Bucket `larry-house-price-regression-data` exists in `ca-central-1`
- IAM user/role has **s3:PutObject** (and s3:GetObject if you want to read back) on the bucket

**If you get AccessDenied:** attach an IAM policy that allows `s3:PutObject` and `s3:GetObject` on `arn:aws:s3:::larry-house-price-regression-data/*`.

In [1]:
import boto3
from pathlib import Path

from botocore.exceptions import ClientError

bucket = "larry-house-price-regression-data"
region = "ca-central-1"

# Project root: works when cwd is project root or notebooks/
PROJECT_ROOT = Path.cwd() if (Path.cwd() / "data").exists() else Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data" / "processed"
MODEL_DIR = PROJECT_ROOT / "models"

s3 = boto3.client("s3", region_name=region)


def upload_to_s3(local_path: Path, s3_key: str) -> bool:
    """Upload a file to S3. Returns True on success, False on skip/error."""
    if not local_path.exists():
        print(f"File {local_path} does not exist. Skipping upload.")
        return False
    print(f"Uploading {local_path} to s3://{bucket}/{s3_key}...")
    try:
        s3.upload_file(str(local_path), bucket, s3_key)
        print(f"  Done.")
        return True
    except ClientError as e:
        if e.response["Error"]["Code"] == "AccessDenied":
            print(
                "  AccessDenied: Your IAM user/role needs s3:PutObject on this bucket.\n"
                "  Example policy (attach to user/role):\n"
                '  {"Effect": "Allow", "Action": ["s3:PutObject", "s3:GetObject"], '
                f'"Resource": "arn:aws:s3:::{bucket}/*"}}'
            )
        else:
            print(f"  Error: {e}")
        return False


# Processed datasets (from feature engineering / notebook 02)
upload_to_s3(DATA_DIR / "train_engineered.csv", "processed/train_engineered.csv")
upload_to_s3(DATA_DIR / "eval_engineered.csv", "processed/eval_engineered.csv")

# Model: try both names used in this repo
upload_to_s3(MODEL_DIR / "xgb_best_model.pkl", "models/xgb_best_model.pkl")
upload_to_s3(MODEL_DIR / "xgb_model.pkl", "models/xgb_model.pkl")

# Encoders (needed for inference)
upload_to_s3(MODEL_DIR / "freq_encoder.pkl", "models/freq_encoder.pkl")
upload_to_s3(MODEL_DIR / "target_encoder.pkl", "models/target_encoder.pkl")

Uploading /Users/larry/house-price-regression/data/processed/train_engineered.csv to s3://larry-house-price-regression-data/processed/train_engineered.csv...


S3UploadFailedError: Failed to upload /Users/larry/house-price-regression/data/processed/train_engineered.csv to larry-house-price-regression-data/processed/train_engineered.csv: An error occurred (AccessDenied) when calling the CreateMultipartUpload operation: User: arn:aws:iam::834000420058:user/Larry is not authorized to perform: s3:PutObject on resource: "arn:aws:s3:::larry-house-price-regression-data/processed/train_engineered.csv" because no identity-based policy allows the s3:PutObject action