# UK Housing Price Prediction - AWS SageMaker Linear Learner
**Author:** Abdul Salam Aldabik

## 1. Setup and Imports

In [None]:
# Install required packages (only if not already installed in SageMaker).
# Use the %pip magic (not !pip) so the install targets this kernel's environment.
# %pip install sagemaker boto3 pandas scikit-learn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import io
import os

# Report the installed SDK versions in the cell output
print(f"SageMaker version: {sagemaker.__version__}")
print(f"Boto3 version: {boto3.__version__}")

ModuleNotFoundError: No module named 'sagemaker.predictor'

## 2. SageMaker Session and Role Setup

In [None]:
# Create SageMaker session and get execution role.
# `sess`, `role`, `bucket` and `prefix` are module-level names reused by the
# upload, estimator and training cells further down.
sess = sagemaker.Session()
role = get_execution_role()
bucket = sess.default_bucket()  # Default S3 bucket for this session
prefix = 'housing-price-prediction'  # S3 prefix for organizing data

# Echo the resolved configuration so it is visible in the cell output
print(f"SageMaker Role: {role}")
print(f"S3 Bucket: {bucket}")
print(f"S3 Prefix: {prefix}")
print(f"Region: {sess.boto_region_name}")

## 3. Load Data

**STEP 1:** Upload `housing_features_final.parquet` to SageMaker:
- Drag and drop the file into the JupyterLab file browser (left sidebar)
- Or click the Upload button (↑ icon) and select the file

**STEP 2:** Run the cell below to load the data

In [None]:
# Load the housing data.
# The path is relative, so the parquet file must sit in the notebook's
# working directory (see the upload steps above this cell).
df = pd.read_parquet('housing_features_final.parquet')

print(f"✅ Dataset loaded: {df.shape}")
print(f"Columns: {len(df.columns)}")
# Last expression of the cell: renders a preview of the first rows
df.head()

## 4. Prepare Features and Target

In [None]:
# Column the model is trained to predict
target = 'log_price'  # or 'price' depending on your setup

# Columns that must never be used as model inputs (both target variants plus
# the other listed fields); only those present in this dataset are kept
non_feature_candidates = ['log_price', 'price', 'date_of_transfer', 'transaction_unique_identifier']
drop_cols = [name for name in non_feature_candidates if name in df.columns]

# Every remaining column is a feature, in the dataset's original column order
features = [name for name in df.columns if name not in drop_cols]

print(f"Target: {target}")
print(f"Number of features: {len(features)}")
print(f"Features: {features[:10]}...")  # Show first 10

In [None]:
# Build the feature matrix and target vector.
# X is an explicit copy, so the column assignments below modify X only: they
# leave `df` untouched and do not trigger pandas' SettingWithCopyWarning.
X = df[features].copy()
y = df[target]

# Handle any Period columns by converting them to strings first.
# (isinstance check replaces the deprecated pd.api.types.is_period_dtype)
period_cols = [col for col in X.columns if isinstance(X[col].dtype, pd.PeriodDtype)]
if period_cols:
    print(f"Converting Period columns: {period_cols}")
    for col in period_cols:
        X[col] = X[col].astype(str)

# Ensure all features are numeric: object columns are parsed with
# errors='coerce', so any value that is not a number becomes NaN.
# NOTE(review): Period strings such as '2020-01' are not valid numbers, so
# those columns end up all-NaN and are zeroed by fillna(0) below — confirm
# this is intended, or encode periods numerically before this step.
for col in X.columns:
    if X[col].dtype == 'object':
        print(f"Converting {col} to numeric...")
        X[col] = pd.to_numeric(X[col], errors='coerce')

# Fill any NaN values (original missing values and coerced ones alike) with 0
X = X.fillna(0)

print(f"\nFinal X shape: {X.shape}")
print(f"Final y shape: {y.shape}")
print(f"\nData types:\n{X.dtypes.value_counts()}")

## 5. Train-Test Split

In [None]:
# Split the data: 20% of the rows are held out as the test set, and the
# fixed random_state makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 6. Prepare Data for SageMaker Linear Learner

SageMaker Linear Learner expects:
- CSV format with target in first column
- No header row
- Data uploaded to S3

In [None]:
# Combine target and features for SageMaker format (target first).
# pd.concat(axis=1) aligns rows on the index, so each y_* / X_* pair must
# carry the same index labels (they come from the same train_test_split call).
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Preview
print("\nFirst few rows of training data:")
# Last expression of the cell: renders the first rows of the training frame
train_data.head()

In [None]:
# Local CSV file names for the two splits
train_file = 'housing_train.csv'
test_file = 'housing_test.csv'

# Write both splits without header row and without the index column
for csv_name, frame in ((train_file, train_data), (test_file, test_data)):
    frame.to_csv(csv_name, header=False, index=False)

# Report the on-disk size of each file in MB
for csv_name in (train_file, test_file):
    size_mb = os.path.getsize(csv_name) / 1024 / 1024
    print(f"Saved {csv_name} ({size_mb:.2f} MB)")

## 7. Upload Data to S3

In [None]:
# Upload the training file, then the test file, each under its own
# sub-prefix ('train' / 'test') of `prefix` in the session bucket
train_s3_path, test_s3_path = [
    sess.upload_data(
        path=local_csv,
        bucket=bucket,
        key_prefix=f'{prefix}/{channel}'
    )
    for local_csv, channel in ((train_file, 'train'), (test_file, 'test'))
]

print(f"Training data uploaded to: {train_s3_path}")
print(f"Test data uploaded to: {test_s3_path}")

## 8. Configure SageMaker Linear Learner

In [None]:
# Get the Linear Learner container image
from sagemaker.image_uris import retrieve

# Look up the image URI for the 'linear-learner' algorithm in the session's region
container = retrieve('linear-learner', sess.boto_region_name, version='latest')
print(f"Linear Learner container: {container}")

In [None]:
# Number of feature columns in the training matrix
n_features = X_train.shape[1]

# Generic estimator wrapping the Linear Learner container image
linear = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',  # As per lab guide
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sess
)

# Hyperparameters, collected in one mapping and applied in a single call
linear_hyperparameters = {
    'feature_dim': n_features,        # Number of features
    'predictor_type': 'regressor',    # For regression (price prediction)
    'mini_batch_size': 100,
    'epochs': 10,
    'learning_rate': 0.01,
    'normalize_data': True,
    'normalize_label': True,
}
linear.set_hyperparameters(**linear_hyperparameters)

print("Linear Learner estimator configured")
print(f"Feature dimension: {n_features}")

## 9. Train the Model

**⚠️ This will take 5-10 minutes and incur AWS charges!**

In [None]:
# Create training input channels.
# Both point at the CSV files uploaded to S3 and declare them as text/csv.
train_input = sagemaker.inputs.TrainingInput(
    s3_data=train_s3_path,
    content_type='text/csv'
)

test_input = sagemaker.inputs.TrainingInput(
    s3_data=test_s3_path,
    content_type='text/csv'
)

# Train the model
print("Starting training job...")
print("This will take approximately 5-10 minutes.")
print("\n" + "="*50)

# The dict keys are the channel names under which each input is passed to the job
linear.fit({'train': train_input, 'test': test_input})

print("\n" + "="*50)
print("✅ Training complete!")

## 10. Deploy the Model

**⚠️ Deploying creates an endpoint that incurs hourly charges. Remember to delete it when done!**

In [None]:
# Deploy the model to an endpoint
print("Deploying model to endpoint...")
print("This will take 5-10 minutes.")

# Requests are serialized as CSV and responses are parsed as JSON.
# NOTE: the endpoint stays up until the clean-up cell calls
# predictor.delete_endpoint().
predictor = linear.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\n✅ Model deployed to endpoint: {predictor.endpoint_name}")

## 11. Make Predictions and Evaluate

In [None]:
# Helper function to predict in batches
def predict_batches(predictor, X, batch_size=500):
    """Score ``X`` through ``predictor`` in fixed-size row batches.

    Args:
        predictor: Object exposing ``predict(array)`` that returns a mapping
            with a ``'predictions'`` list whose items each hold a ``'score'``.
        X: pandas DataFrame of feature rows; it is sliced with ``.iloc`` and
            each slice is sent as ``batch.values``.
        batch_size: Maximum number of rows sent per request; must be >= 1.

    Returns:
        numpy.ndarray containing one score per row of ``X``, in row order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make range() empty and silently return no
    # predictions, so reject invalid sizes up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    predictions = []
    n_rows = len(X)

    for batch_number, start in enumerate(range(0, n_rows, batch_size), start=1):
        batch = X.iloc[start:start + batch_size]
        result = predictor.predict(batch.values)

        # Extract predictions from response
        predictions.extend(pred['score'] for pred in result['predictions'])

        # Progress message after every 10th batch
        if batch_number % 10 == 0:
            print(f"Processed {start + len(batch)}/{n_rows} samples...")

    return np.array(predictions)

# Score the test features through the deployed endpoint (default batch size)
print("Making predictions on test set...")
y_pred = predict_batches(predictor, X_test)
print(f"\nPredictions shape: {y_pred.shape}")

In [None]:
# Calculate metrics
# NOTE: these names are already imported in the setup cell; this import is
# redundant but harmless.
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# All four metrics are computed on the target's own scale
# (log space when target == 'log_price')
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("="*60)
print("AWS SageMaker Linear Learner - Model Performance")
print("="*60)
print(f"Mean Squared Error (MSE):     {mse:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"Mean Absolute Error (MAE):     {mae:,.2f}")
print(f"R² Score:                      {r2:.4f}")
print("="*60)

# If using log_price, show actual price RMSE
# NOTE(review): np.exp inverts a natural-log transform — confirm log_price was
# built with np.log upstream (a log1p target would need np.expm1 instead).
# `actual_rmse` is only defined inside this branch.
if target == 'log_price':
    actual_prices = np.exp(y_test)
    predicted_prices = np.exp(y_pred)
    actual_rmse = np.sqrt(mean_squared_error(actual_prices, predicted_prices))
    print(f"\nActual Price RMSE: £{actual_rmse:,.2f}")
    print("="*60)

## 12. Visualize Results

In [None]:
import matplotlib.pyplot as plt

# One figure with two side-by-side panels, addressed through explicit axes
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: predicted vs actual, with the dashed y = x reference line
actual_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_pred, alpha=0.3, s=1)
ax_fit.plot(actual_range, actual_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title(f'Predicted vs Actual (R² = {r2:.4f})')
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction
residuals = y_test - y_pred
ax_resid.scatter(y_pred, residuals, alpha=0.3, s=1)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')
ax_resid.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 13. Save Results for Model Comparison

In [None]:
# Save metrics to file for comparison notebook
results = {
    'model': 'AWS SageMaker Linear Learner',
    'mse': mse,
    'rmse': rmse,
    'mae': mae,
    'r2': r2,
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'features': X_train.shape[1],
    'endpoint_name': predictor.endpoint_name
}

if target == 'log_price':
    results['actual_price_rmse'] = actual_rmse

# Save to file
import json
with open('aws_sagemaker_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved to aws_sagemaker_results.json")
print("\nResults:")
print(json.dumps(results, indent=2))

## 14. ⚠️ IMPORTANT: Clean Up Resources

**Delete the endpoint to stop incurring charges!**

In [None]:
# Delete the endpoint.
# The endpoint name is printed before the delete call so it appears in the
# cell output.
print(f"Deleting endpoint: {predictor.endpoint_name}")
predictor.delete_endpoint()
print("✅ Endpoint deleted successfully!")
print("\n⚠️ Remember to also stop your notebook instance in the SageMaker console!")

## Summary

This notebook:
1. ✅ Loaded and prepared UK Housing data
2. ✅ Uploaded data to S3
3. ✅ Trained AWS SageMaker Linear Learner model
4. ✅ Deployed model to endpoint
5. ✅ Made predictions and evaluated performance
6. ✅ Saved results for comparison
7. ✅ Cleaned up resources

**Next Steps:**
- Download this notebook from SageMaker
- Save it to your GitHub repo as `09_AWS_SageMaker_Model.ipynb`
- Use the results in your model comparison notebook
- Remember to stop your SageMaker notebook instance!