In [2]:
import sagemaker

# Create a SageMaker session and read the AWS region it reports.
# `aws_region` is reused by the bucket-creation cell, both in the bucket
# name and as the region of the S3 client.
sagemaker_session = sagemaker.Session()
aws_region = sagemaker_session.boto_region_name

print(f"AWS Region: {aws_region}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
AWS Region: us-east-1


In [3]:
import boto3
import time
# --- Bucket name ---
# S3 bucket names must be globally unique, so the current Unix timestamp is
# appended to the name.
# NOTE: because of that timestamp, every run of this cell creates a new bucket.
unique_id = int(time.time())
bucket_name = f"sagemaker-project-{aws_region}-{unique_id}"
print(f"Creating bucket: {bucket_name}")

# --- S3 client, bound to the region reported by the SageMaker session ---
s3_client = boto3.client("s3", region_name=aws_region)

# --- Create the bucket ---
# A 'LocationConstraint' is necessary for all regions *except* us-east-1,
# so it is only added to the request when the region is something else.
create_bucket_kwargs = {"Bucket": bucket_name}
if aws_region != "us-east-1":
    create_bucket_configuration = {
        'LocationConstraint': aws_region
    }
    create_bucket_kwargs["CreateBucketConfiguration"] = create_bucket_configuration
s3_client.create_bucket(**create_bucket_kwargs)

print(f"Successfully created bucket: {bucket_name}")

Creating bucket: sagemaker-project-us-east-1-1763116930
Successfully created bucket: sagemaker-project-us-east-1-1763116930


In [None]:
import pandas as pd
import boto3


# Bucket that receives the train/validation files at the end of this cell.
# NOTE(review): the training and verification cells in this notebook point at
# a bucket ending in -1763114558 -- confirm which bucket is the intended one.
bucket_name = 'sagemaker-project-us-east-1-1763116930'

# --- 1. Load the local file ---

# Name of the Parquet file uploaded to the notebook environment.
# Change this if your uploaded file is named differently.
local_file_path = "price_paid_model_ready.parquet"

print(f"Loading full dataset from local file: '{local_file_path}'")
try:
    df_full = pd.read_parquet(local_file_path)
except FileNotFoundError:
    # Explain the most likely causes, then let the error stop the cell.
    print(f"Error: The file '{local_file_path}' was not found in your notebook environment.")
    print("This means the upload may have failed or the file name is misspelled.")
    raise
else:
    print(f"Loaded {len(df_full)} total records.")

# --- 2. Split the data by Year (Time-Series Split) ---
#
# Rows are assigned to a split by comparing `year_column` with `split_year`:
#   * years strictly BEFORE `split_year` -> training
#   * `split_year` AND LATER             -> validation

year_column = 'sale_year'  # <-- name of the column holding the sale year
split_year = 2014          # <-- CHANGE TO YOUR DESIRED SPLIT YEAR

try:
    print(f"Splitting data on column '{year_column}' before year {split_year}...")

    # The comparison below needs a numeric column; text years are converted.
    # A value that cannot be converted raises and is reported by the generic
    # handler at the bottom of this block.
    if not pd.api.types.is_numeric_dtype(df_full[year_column]):
        print(f"Warning: Column '{year_column}' is not numeric. Attempting to convert to integer...")
        df_full[year_column] = df_full[year_column].astype(int)

    # Simple integer comparison for the split
    df_train = df_full[df_full[year_column] < split_year]
    df_validation = df_full[df_full[year_column] >= split_year]

    # A missing year (NaN/NA) satisfies neither comparison, so such rows end
    # up in neither split. Report them instead of dropping them silently.
    unassigned_count = len(df_full) - len(df_train) - len(df_validation)
    if unassigned_count > 0:
        print(f"Warning: {unassigned_count} records have no usable '{year_column}' value and are in neither split.")

    if len(df_train) == 0 or len(df_validation) == 0:
        print(f"Warning: Your split year '{split_year}' or column '{year_column}' resulted in an empty set.")
        print(f"Training records: {len(df_train)}, Validation records: {len(df_validation)}")
    else:
        print("Split complete.")
        print(f"Training records:   {len(df_train)} (Years: {df_train[year_column].min()} - {df_train[year_column].max()})")
        print(f"Validation records: {len(df_validation)} (Years: {df_validation[year_column].min()} - {df_validation[year_column].max()})")


except KeyError:
    print(f"Error: The column '{year_column}' was not found in your Parquet file.")
    print("Please change the 'year_column' variable in this script to the correct column name.")
    raise
except Exception as e:
    print(f"An error occurred during the split: {e}")
    raise

# --- 3. Save the new split files locally ---
# Each split is written to its own Parquet file without the DataFrame index.
local_train_file = "train.parquet"
local_validation_file = "validation.parquet"

for split_frame, parquet_path in (
    (df_train, local_train_file),
    (df_validation, local_validation_file),
):
    split_frame.to_parquet(parquet_path, index=False)

print("Local train.parquet and validation.parquet files created.")

# --- 4. Upload the new files to separate S3 prefixes ---
# The client is created here so this cell does not depend on the
# bucket-creation cell having been run in the current kernel session
# (`bucket_name` is already set at the top of this cell).
s3_client = boto3.client("s3")

s3_prefix_train = "data/train"
s3_prefix_validation = "data/validation"

# These are the final paths you will give to the SageMaker training job
s3_input_train_path = f"s3://{bucket_name}/{s3_prefix_train}"
s3_input_validation_path = f"s3://{bucket_name}/{s3_prefix_validation}"

try:
    # Upload the TRAINING file
    print(f"Uploading {local_train_file} to {s3_input_train_path}/")
    s3_client.upload_file(
        Filename=local_train_file,
        Bucket=bucket_name,
        Key=f"{s3_prefix_train}/train.parquet" # Saves it as data/train/train.parquet
    )

    # Upload the VALIDATION file
    print(f"Uploading {local_validation_file} to {s3_input_validation_path}/")
    s3_client.upload_file(
        Filename=local_validation_file,
        Bucket=bucket_name,
        Key=f"{s3_prefix_validation}/validation.parquet" # Saves it as data/validation/validation.parquet
    )

    print("\n--- All steps complete! ---")
    print(f"Your training data is ready at: {s3_input_train_path}")
    print(f"Your validation data is ready at: {s3_input_validation_path}")

except Exception as e:
    # Report the failure, then re-raise like the other handlers in this cell:
    # a failed upload must not look like a successful run, because the
    # training job would otherwise start against missing data.
    print(f"An error occurred during the S3 upload: {e}")
    raise

Loading full dataset from local file: 'price_paid_model_ready.parquet'


In [2]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
!pip install lightgbm
# --- 1. Bucket holding the prepared data ---
# NOTE(review): the bucket-creation and upload cells in this notebook show a
# bucket ending in -1763116930 -- confirm this is the bucket you mean to read.
bucket_name = "sagemaker-project-us-east-1-1763114558"

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# --- 2. S3 locations of the two input channels ---
s3_input_train_path = f"s3://{bucket_name}/data/train"
s3_input_validation_path = f"s3://{bucket_name}/data/validation"

for summary_line in (
    f"Using training data: {s3_input_train_path}",
    f"Using validation data: {s3_input_validation_path}",
    f"Using IAM Role: {role}",
):
    print(summary_line)

# --- 3. Configure the SageMaker scikit-learn estimator ---
# Hyperparameters handed to the entry-point script (main.py).
lightgbm_hyperparameters = {
    "n_estimators": 200,
    "num_leaves": 31,
    "learning_rate": 0.05,
}

# NOTE(review): `requirements_file` is intended to install lightgbm in the
# training container. Confirm the SDK version in use honours this argument;
# the commonly documented route is a requirements.txt inside `source_dir`.
sklearn_estimator = SKLearn(
    entry_point="main.py",
    requirements_file="requirements.txt",
    framework_version="1.2-1",
    py_version="py3",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    hyperparameters=lightgbm_hyperparameters,
)

# --- 4. Map channel names to their S3 prefixes ---
data_channels = dict(
    train=s3_input_train_path,
    validation=s3_input_validation_path,
)

# --- 5. Run the training job, then report where the artifacts landed ---
print("--- Starting LightGBM Training Job ---")
sklearn_estimator.fit(data_channels)

print("--- Training Complete ---")
print(f"Model artifacts saved to: {sklearn_estimator.model_data}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Collecting lightgbm
  Downloading lightgbm-4.6.0.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m64.4 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (pyproject.toml) ... [?25ldone
[?25h  Created wheel for lightgbm: filename=lightgbm-4.6.0-py3-none-linux_x86_64.whl size=2737777 sha256=3d0e867e27fd7a44bb6a2ab89a9fb543cb44a452fb685073674f0f2de663eee2
  Stored in directory: /home/ec2-user/.cache/pip/wheels/bb/db/6d/7814aed03437129dc284a055c084f201b765deb5

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2025-11-14-13-57-11-343


2025-11-14 13:57:13 Starting - Starting the training job...
2025-11-14 13:57:28 Starting - Preparing the instances for training...
2025-11-14 13:57:51 Downloading - Downloading input data...
2025-11-14 13:58:26 Downloading - Downloading the training image......
  import pkg_resources[0m
[34m2025-11-14 13:59:36,665 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-11-14 13:59:36,671 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-11-14 13:59:36,674 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-11-14 13:59:36,693 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-11-14 13:59:37,002 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-11-14 13:59:37,006 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2

In [1]:
import boto3

# --- 1. CONFIGURE YOUR BUCKET NAME ---
# This MUST match the bucket you created
bucket_name = "sagemaker-project-us-east-1-1763114558"
# ---

# Define the prefixes we are checking
s3_prefix_train = "data/train"
s3_prefix_validation = "data/validation"

# Get the S3 client
s3_client = boto3.client("s3")


def report_files_in_prefix(client, bucket, prefix, label):
    """Print the object keys found under ``prefix`` in ``bucket``.

    Args:
        client: boto3 S3 client used for the listing.
        bucket: Name of the bucket to inspect.
        prefix: Key prefix to list; passed to S3 unchanged.
        label: Dataset name used in the printed messages (e.g. "training").

    Returns:
        The list of keys found. The list is empty when nothing matched or
        when the listing failed (the error is printed, not raised, so the
        remaining checks still run).
    """
    print(f"\nChecking for files in: s3://{bucket}/{prefix}/")
    try:
        # A single list_objects_v2 call returns only one page of keys, so a
        # paginator is used to walk every page.
        paginator = client.get_paginator("list_objects_v2")

        # Keys ending in '/' are folder placeholder entries rather than data
        # files, so they are skipped.
        files_found = [
            obj['Key']
            for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
            for obj in page.get('Contents', [])
            if not obj['Key'].endswith('/')
        ]

        if files_found:
            print(f"--- Found {label} files: ---")
            for file_key in files_found:
                print(f" - {file_key}")
        else:
            print(f"!!! NO {label.upper()} FILES FOUND in this prefix.")
        return files_found

    except Exception as e:
        print(f"An error occurred listing {label} files: {e}")
        return []


print("--- Checking for files... ---")

# --- 2. Check the TRAINING prefix, then 3. the VALIDATION prefix ---
for label, prefix in (
    ("training", s3_prefix_train),
    ("validation", s3_prefix_validation),
):
    files_found = report_files_in_prefix(s3_client, bucket_name, prefix, label)

print("\n--- Check Complete ---")

--- Checking for files... ---

Checking for files in: s3://sagemaker-project-us-east-1-1763114558/data/train/
--- Found training files: ---
 - data/train/train.parquet

Checking for files in: s3://sagemaker-project-us-east-1-1763114558/data/validation/
--- Found validation files: ---
 - data/validation/validation.parquet

--- Check Complete ---
