# Ames Housing Dataset

## About Dataset
The Ames Housing Dataset is a well-known dataset in the field of machine learning and data analysis. It contains various features and attributes of residential homes in Ames, Iowa, USA. The dataset is often used for regression tasks, particularly for predicting housing prices.

### Key Details:
- **Number of Instances**: The dataset consists of 2,930 instances or observations.
- **Number of Features**: There are 79 explanatory features (variables) that describe various aspects of the residential properties; the raw CSV additionally contains the `Order` and `PID` identifier columns.
- **Target Variable**: The target variable in the dataset is `SalePrice`, representing the sale price of the houses.
- **Data Types**:  
  - The features include both numerical and categorical variables.
  - They cover a wide range of aspects such as lot size, number of rooms, location, construction quality, and more.

### Applications:
The Ames Housing Dataset is widely used in the machine learning community for:
- **Regression Modeling**: Predicting house prices based on property features.
- **Feature Engineering**: Developing and testing new techniques to handle numerical and categorical data.
- **Predictive Analytics**: Analyzing and forecasting trends in the real estate domain.

This dataset serves as a valuable resource for exploring and applying machine learning algorithms to real-world problems related to housing prices.


# STEP 1: IMPORT LIBRARIES AND SETUP

In [1]:
import os

# Data Handling & Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical Functions
from scipy.stats import skew

# Sklearn Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# AWS & SageMaker Libraries
import boto3
import sagemaker
from sagemaker import Session, get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.amazon.linear_learner import LinearLearner
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# STEP 2: LOAD THE DATA

In [2]:
def load_data(filepath):
    """
    Parse the CSV found at `filepath` and hand it back as a pandas DataFrame.

    `filepath` is forwarded unchanged to `pd.read_csv`, so whatever that
    function accepts as its first argument can be passed here.
    """
    frame = pd.read_csv(filepath)
    return frame

# Location of the raw CSV (relative to the working directory) and the name of
# the column used as the prediction target throughout the notebook.
file_path = "AmesHousing.csv"
target_col = "SalePrice"

df = load_data(file_path)
print("Data loaded successfully!\n")
# Quick sanity check of what was loaded: first rows, overall shape, and the
# per-column summary produced by DataFrame.info().
print(df.head())
print("\nShape of the dataset:", df.shape)
df.info()

Data loaded successfully!

   Order        PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street  \
0      1  526301100           20        RL         141.0     31770   Pave   
1      2  526350040           20        RH          80.0     11622   Pave   
2      3  526351010           20        RL          81.0     14267   Pave   
3      4  526353030           20        RL          93.0     11160   Pave   
4      5  527105010           60        RL          74.0     13830   Pave   

  Alley Lot Shape Land Contour  ... Pool Area Pool QC  Fence Misc Feature  \
0   NaN       IR1          Lvl  ...         0     NaN    NaN          NaN   
1   NaN       Reg          Lvl  ...         0     NaN  MnPrv          NaN   
2   NaN       IR1          Lvl  ...         0     NaN    NaN         Gar2   
3   NaN       Reg          Lvl  ...         0     NaN    NaN          NaN   
4   NaN       IR1          Lvl  ...         0     NaN  MnPrv          NaN   

  Misc Val Mo Sold Yr Sold Sale Type  Sale Cond

# STEP 3: PREPROCESS THE DATA

## Drop Columns with Too Many Missing Values

In [3]:
def drop_high_missing_columns(dataframe, threshold=0.3):
    """
    Return `dataframe` without its sparsely populated columns.

    A column is removed when its share of null entries is strictly greater
    than `threshold` (a fraction between 0 and 1). The names of the removed
    columns are printed. The input frame itself is not modified.
    """
    # Per-column fraction of nulls, indexed by column name.
    missing_fraction = dataframe.isnull().mean()
    sparse_columns = missing_fraction[missing_fraction > threshold].index
    print(f"Dropping columns with more than {threshold*100}% missing values: {list(sparse_columns)}")
    trimmed = dataframe.drop(columns=sparse_columns)
    return trimmed

# No threshold is passed, so the function's default applies; df is rebound to the result.
df = drop_high_missing_columns(df)

Dropping columns with more than 30.0% missing values: ['Alley', 'Fireplace Qu', 'Pool QC', 'Fence', 'Misc Feature']


## Drop Rows with Missing Target

In [4]:
def drop_missing_target(dataframe, target_column):
    """
    Discard every row whose `target_column` value is missing.

    If `target_column` is not a column of `dataframe`, a notice is printed
    and the very same frame is returned. Otherwise a filtered frame is
    returned and the number of discarded rows is printed.
    """
    # Guard clause: nothing to filter on when the target column is absent.
    if target_column not in dataframe.columns:
        print(f"Target column '{target_column}' not found.")
        return dataframe

    labeled = dataframe.dropna(subset=[target_column])
    removed = len(dataframe) - len(labeled)
    print(f"Dropped {removed} rows with missing '{target_column}'.")
    return labeled

# Rebind df to the frame that only contains rows with a known target value.
df = drop_missing_target(df, target_col)

Dropped 0 rows with missing 'SalePrice'.


## Fill Remaining Missing Values

In [5]:
def fill_missing_values(dataframe):
    """
    Return a copy of `dataframe` with its missing values imputed.

    Numeric columns receive their own column median; every other column
    receives the literal string 'Missing'. The input frame is left untouched.
    """
    imputed = dataframe.copy()

    # Partition the columns by dtype: numeric vs. everything else.
    num_names = imputed.select_dtypes(include=[np.number]).columns
    other_names = imputed.select_dtypes(exclude=[np.number]).columns

    # One median per numeric column, aligned by column name inside fillna.
    column_medians = imputed[num_names].median()
    imputed[num_names] = imputed[num_names].fillna(column_medians)

    # Non-numeric gaps become an explicit 'Missing' category.
    imputed[other_names] = imputed[other_names].fillna("Missing")

    return imputed

# Rebind df to the imputed copy returned by the helper.
df = fill_missing_values(df)

## Remove Outliers

In [6]:
def remove_outliers(dataframe, col_name, upper_limit):
    """
    Remove rows from the dataframe where the specified
    column exceeds the upper_limit value.

    Parameters:
        dataframe: pandas DataFrame to filter; it is not modified in place.
        col_name: name of the column to test.
        upper_limit: largest value that is still kept. Rows equal to the
            limit are retained; only rows strictly above it are removed.

    Returns:
        The filtered DataFrame, or the same `dataframe` object unchanged when
        `col_name` is not one of its columns.

    Note:
        Rows whose `col_name` value is missing (NaN) are dropped as well,
        because the comparison evaluates to False for them, and they are
        included in the printed count.
    """
    if col_name not in dataframe.columns:
        print(f"Column '{col_name}' not found. Skipping outlier removal.")
        return dataframe
    before = len(dataframe)
    # Inclusive bound: a value exactly at the limit does not "exceed" it.
    dataframe = dataframe[dataframe[col_name] <= upper_limit]
    after = len(dataframe)
    print(f"Removed {before - after} outliers from '{col_name}'.")
    return dataframe

# Trim extreme 'Gr Liv Area' values using 4000 as the cutoff; df is rebound to the filtered frame.
df = remove_outliers(df, col_name="Gr Liv Area", upper_limit=4000)

Removed 5 outliers from 'Gr Liv Area'.


## Encode Categorical Variables

In [7]:
def encode_categorical_features(dataframe, freq_threshold=10):
    """
    Encode categorical variables using one-hot encoding if they have
    less than or equal to freq_threshold unique categories,
    otherwise apply frequency encoding.

    Parameters:
        dataframe: pandas DataFrame; only its object-dtype columns are
            treated as categorical. The input is not modified.
        freq_threshold: maximum number of distinct values a column may have
            and still be one-hot encoded.

    Returns:
        A new DataFrame in which every object-dtype column has been replaced
        by either a single `<col>_freq` column (relative frequency of each
        row's category) or a set of integer 0/1 `<col>_<category>` indicator
        columns with the first category dropped.
    """
    df_copy = dataframe.copy()
    cat_cols = df_copy.select_dtypes(include=["object"]).columns
    one_hot_frames = []
    freq_frames = {}

    for col in cat_cols:
        if df_copy[col].nunique() > freq_threshold:
            # High cardinality: map each category to its share of the rows.
            freq_map = df_copy[col].value_counts(normalize=True)
            freq_frames[col + "_freq"] = df_copy[col].map(freq_map)
        else:
            # dtype=int keeps the indicators numeric. pandas >= 2.0 defaults
            # to boolean dummies, which would be written to CSV as
            # "True"/"False" and make `.values` an object array once mixed
            # with the numeric columns.
            one_hot_frames.append(
                pd.get_dummies(df_copy[col], prefix=col, drop_first=True, dtype=int)
            )

    # Merge frequency encoded columns
    if freq_frames:
        df_copy = df_copy.join(pd.DataFrame(freq_frames, index=df_copy.index))
    # Merge one-hot encoded columns
    if one_hot_frames:
        df_copy = df_copy.join(pd.concat(one_hot_frames, axis=1))

    # Drop original categorical columns
    df_copy = df_copy.drop(columns=cat_cols)
    return df_copy

# Replace the object-dtype columns with their encoded counterparts (default freq_threshold).
df = encode_categorical_features(df)

# STEP 4: SPLIT THE DATA INTO TRAIN, VALIDATION, AND TEST

In [8]:
# Separate the feature matrix from the target column.
X = df.drop(columns=[target_col])
y = df[target_col]

# First split: 70% train, 30% held out for validation + test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# Second split: halve the held-out part, giving roughly 15% validation and 15% test.
# The fixed random_state keeps both splits reproducible.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Report (rows, columns) of each feature set next to the shape of its labels.
print(f"Train set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

Train set: (2047, 203), (2047,)
Validation set: (439, 203), (439,)
Test set: (439, 203), (439,)


# STEP 5: PREPARE FILES AND SET UP SAGEMAKER SESSION

In [9]:
# Combine labels and features for train/validation
# Combine labels and features for train/validation.
# The label is placed first so it becomes the first CSV column.
train_data = pd.concat([y_train, X_train], axis=1)
validation_data = pd.concat([y_val, X_val], axis=1)

# Local file names for the three splits
train_file = 'ames_train.csv'
validation_file = 'ames_validation.csv'
test_file = 'ames_test.csv'  # written locally below; this cell does not upload it

# Written without index and without a header row. SageMaker's built-in
# algorithms expect CSV training data in that form, label in the first column.
train_data.to_csv(train_file, index=False, header=False)
validation_data.to_csv(validation_file, index=False, header=False)
pd.concat([y_test, X_test], axis=1).to_csv(test_file, index=False, header=False)

# Initialize the SageMaker session once; role, bucket and region are derived
# from it / the execution environment and reused by later cells.
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
prefix = "sagemaker/ames-housing"

print(f"Role: {role}")
print(f"Region: {region}")
print(f"Bucket: {bucket}")

# Upload training and validation data under the prefix in the default bucket;
# the returned values are kept as the training-channel inputs used later.
train_uri = sagemaker_session.upload_data(path=train_file, bucket=bucket, key_prefix=prefix)
validation_uri = sagemaker_session.upload_data(path=validation_file, bucket=bucket, key_prefix=prefix)


Role: arn:aws:iam::525897591902:role/LabRole
Region: us-east-1
Bucket: sagemaker-us-east-1-525897591902


# STEP 6: TRAIN THE LINEAR LEARNER MODEL (REGRESSION)

In [10]:
# Look up the container image for the built-in Linear Learner algorithm in
# the session's region.
container = sagemaker.image_uris.retrieve(
    framework="linear-learner",
    region=sagemaker_session.boto_region_name
)

# Generic Estimator wrapping that image; model artifacts are written under
# the 'output' folder of the notebook's S3 prefix.
linear_learner = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session
)

# feature_dim is the number of columns in X (the target column is not part of X).
linear_learner.set_hyperparameters(
    feature_dim=X.shape[1],
    predictor_type='regressor',
    mini_batch_size=32,
    epochs=10
)

# Both channels point at the uploaded CSV files and are declared as text/csv.
train_input = TrainingInput(s3_data=train_uri, content_type="text/csv")
validation_input = TrainingInput(s3_data=validation_uri, content_type="text/csv")

# Launch the training job with a 'train' and a 'validation' channel.
linear_learner.fit({'train': train_input, 'validation': validation_input})


2025-01-29 13:07:26 Starting - Starting the training job...
2025-01-29 13:07:41 Starting - Preparing the instances for training...
2025-01-29 13:08:06 Downloading - Downloading input data...
2025-01-29 13:08:41 Downloading - Downloading the training image........[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/29/2025 13:10:06 INFO 139654154364736] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile': '0.5', 'loss_insensitivity':

# STEP 7: DEPLOY THE MODEL

In [None]:
# Deploy the trained estimator to a real-time endpoint with a fixed name.
# Because the name is fixed, a leftover endpoint / endpoint config from an
# earlier run is detected first and (after confirmation) removed.
session = Session()  # kept from the original cell; nothing below reads it
sm_client = boto3.client("sagemaker")

EndpointConfig = "regression-linear-learner-endpoint-config"
Endpoint = "regression-linear-learner-endpoint"

# 7.1 Check if the endpoint already exists
endpoint_exists = False
try:
    sm_client.describe_endpoint(EndpointName=Endpoint)
    endpoint_exists = True
except sm_client.exceptions.ClientError as e:
    error_code = e.response["Error"]["Code"]
    error_message = e.response["Error"]["Message"]
    # A "not found" answer simply means there is nothing to clean up;
    # any other client error is unexpected and is propagated unchanged.
    if (error_code == "ValidationException" and "Could not find endpoint" in error_message) \
       or ("ResourceNotFound" in error_message):
        endpoint_exists = False
    else:
        raise

if endpoint_exists:
    delete_prompt = input(f"Endpoint '{Endpoint}' already exists. Delete it? [y/n] ")
    if delete_prompt.lower().startswith("y"):
        sm_client.delete_endpoint(EndpointName=Endpoint)
        # delete_endpoint only starts the deletion. Block until the endpoint
        # is really gone, otherwise deploy() below would race with it while
        # trying to reuse the same name.
        sm_client.get_waiter("endpoint_deleted").wait(EndpointName=Endpoint)
        print(f"Deleted endpoint: {Endpoint}")
    # NOTE: if the answer is not "y" the endpoint is left in place and the
    # deploy() call below will attempt to create one with the same name.

# 7.2 Check if the endpoint config exists
# NOTE(review): deploy() below is only given endpoint_name; the SDK appears to
# name the endpoint config it creates after the endpoint rather than after
# `EndpointConfig` -- confirm, and if so also check for a config named `Endpoint`.
endpoint_config_exists = False
try:
    sm_client.describe_endpoint_config(EndpointConfigName=EndpointConfig)
    endpoint_config_exists = True
except sm_client.exceptions.ClientError as e:
    error_code = e.response["Error"]["Code"]
    error_message = e.response["Error"]["Message"]
    if (error_code == "ValidationException" and "Could not find endpoint configuration" in error_message) \
       or ("ResourceNotFound" in error_message):
        endpoint_config_exists = False
    else:
        raise

if endpoint_config_exists:
    delete_config_prompt = input(f"Endpoint config '{EndpointConfig}' already exists. Delete it? [y/n] ")
    if delete_config_prompt.lower().startswith("y"):
        sm_client.delete_endpoint_config(EndpointConfigName=EndpointConfig)
        print(f"Deleted endpoint config: {EndpointConfig}")

# 7.3 Deploy the model (once everything is cleared)
predictor = linear_learner.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=Endpoint
)

# 7.4 Configure the predictor serializer/deserializer:
# requests are sent as CSV rows, responses are parsed from JSON.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

-----

# STEP 8: EVALUATE THE DEPLOYED MODEL ON THE VALIDATION SET

In [None]:
# Send every validation row to the endpoint in a single request.
predictions = predictor.predict(X_val.values)
# The parsed response is read as {"predictions": [{"score": ...}, ...]};
# collect each score as a float, in row order.
y_pred = [float(result["score"]) for result in predictions["predictions"]]

# Regression metrics comparing the endpoint's scores with the true targets.
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"\nValidation Metrics:\nMSE: {mse}\nMAE: {mae}\nR²: {r2}")

# STEP 9: QUERY THE DEPLOYED ENDPOINT WITH TEST DATA

In [None]:
# Pick the first 5 rows from the test set
# Pick the first 5 rows from the test set
sample_data = X_test.head(5)

# Convert sample data to a NumPy array (features only, no label column)
input_data = sample_data.values

# Get predictions from the endpoint
predictions = predictor.predict(input_data)

# Extract the predicted values (scores) from the response.
# If the expected top-level key is absent, print the raw response instead
# so the unexpected payload can be inspected.
if "predictions" in predictions:
    y_pred_test = [float(result["score"]) for result in predictions["predictions"]]
    print("\nTest Predictions on 5 samples:", y_pred_test)
else:
    print("No 'predictions' key found in the response:", predictions)

# Look up the true targets by the sampled rows' index labels so they line up
# with the predictions above.
sample_targets = y_test.loc[sample_data.index]
print("Actual:", sample_targets.tolist())

# STEP 10: DELETE THE ENDPOINT AND ENDPOINT CONFIG (OPTIONAL CLEANUP)

In [None]:
def delete_sagemaker_endpoint_and_config(sm_client, endpoint_name, endpoint_config_name):
    """
    Tear down a SageMaker endpoint and then its endpoint configuration.

    Each resource is first described and, if that succeeds, deleted. A client
    error whose text says the resource could not be found is reported with a
    printed message and otherwise ignored; any other client error is re-raised.
    """
    # One row per resource: describe call, delete call, identifying keyword
    # argument, "not found" marker text, success message, absent message.
    teardown_plan = (
        (
            "describe_endpoint",
            "delete_endpoint",
            {"EndpointName": endpoint_name},
            "Could not find endpoint",
            f"Deleted endpoint: {endpoint_name}",
            f"Endpoint '{endpoint_name}' does not exist.",
        ),
        (
            "describe_endpoint_config",
            "delete_endpoint_config",
            {"EndpointConfigName": endpoint_config_name},
            "Could not find endpoint configuration",
            f"Deleted endpoint config: {endpoint_config_name}",
            f"Endpoint config '{endpoint_config_name}' does not exist.",
        ),
    )

    for describe_op, delete_op, identifier, not_found_hint, done_msg, absent_msg in teardown_plan:
        try:
            getattr(sm_client, describe_op)(**identifier)
            getattr(sm_client, delete_op)(**identifier)
            print(done_msg)
        except sm_client.exceptions.ClientError as err:
            err_text = str(err)
            if not_found_hint not in err_text and "ResourceNotFound" not in err_text:
                raise err
            print(absent_msg)

# Uncomment to clean up once you're done testing:
# delete_sagemaker_endpoint_and_config(sm_client, Endpoint, EndpointConfig)