In [None]:
# ## 1. Import Libraries
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [None]:
# ## 2. Load and Preprocess Data
#
# Preprocessing steps:
# - Parse the `'date'` column as a datetime and take the sale year from it.
# - Engineer three features: `age` (sale year minus `yr_built`), `was_renovated`
#   (1 when `yr_renovated` > 0, else 0) and `was_viewed` (1 when `view` > 0, else 0).
# - Discard the columns the models do not consume: `'id'`, `'date'`, `'yr_built'`,
#   `'yr_renovated'` and `'view'`. The sale year/month/day are only intermediates
#   and never become part of the final frame.

# %%
# Read the raw sales records
raw_sales = pd.read_csv('kc_house_data.csv')

# Year in which each sale took place
sale_year = pd.to_datetime(raw_sales['date']).dt.year

# Build the modelling frame in one pass: append the engineered columns,
# then remove the source columns they were derived from (plus the row id).
data = (
    raw_sales
    .assign(
        age=sale_year - raw_sales['yr_built'],
        was_renovated=raw_sales['yr_renovated'].gt(0).astype('int64'),
        was_viewed=raw_sales['view'].gt(0).astype('int64'),
    )
    .drop(columns=['id', 'date', 'yr_built', 'yr_renovated', 'view'])
)

# Verify preprocessing
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,zipcode,lat,long,sqft_living15,sqft_lot15,age,was_renovated,was_viewed
0,221900.0,3,1.0,1180,5650,1.0,0,3,7,1180,0,98178,47.5112,-122.257,1340,5650,59,0,0
1,538000.0,3,2.25,2570,7242,2.0,0,3,7,2170,400,98125,47.721,-122.319,1690,7639,63,1,0
2,180000.0,2,1.0,770,10000,1.0,0,3,6,770,0,98028,47.7379,-122.233,2720,8062,82,0,0
3,604000.0,4,3.0,1960,5000,1.0,0,5,7,1050,910,98136,47.5208,-122.393,1360,5000,49,0,0
4,510000.0,3,2.0,1680,8080,1.0,0,3,8,1680,0,98074,47.6168,-122.045,1800,7503,28,0,0


In [None]:
# ## 3. Define Features and Target, and Split the Data
#
# Model inputs, kept in the article's order:
#
# - `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`, `floors`, `waterfront`,
#   `condition`, `grade`, `sqft_above`, `sqft_basement`, `age`, `was_renovated`,
#   `was_viewed`, `lat`, `long`, `zipcode`
#
# The value being predicted is `price`.

# %%
features_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'age', 'was_renovated',
    'was_viewed', 'lat', 'long', 'zipcode',
]
X, y = data[features_list], data['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# ## 4. Scale the Features
#
# Two stages: StandardScaler first, then MinMaxScaler on its output so every
# feature ends up in the [0, 1] range on the training set.

# %%
# Stage 1: learn mean/variance on the training split only, then apply to both splits
standard_scaler = StandardScaler().fit(X_train)
X_train_standard = standard_scaler.transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

# Stage 2: learn min/max on the standardized training split, then apply to both splits
minmax_scaler = MinMaxScaler().fit(X_train_standard)
X_train_scaled = minmax_scaler.transform(X_train_standard)
X_test_scaled = minmax_scaler.transform(X_test_standard)

In [None]:
# ## 5. Train the Models

# ### 5.1 Train Random Forest Model

# %%
# 100 trees with a fixed seed; fit() returns the fitted estimator itself
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Score on the held-out split
rf_pred = rf_model.predict(X_test_scaled)
rf_r2 = r2_score(y_true=y_test, y_pred=rf_pred)
print("Random Forest R² Score:", rf_r2)

Random Forest R² Score: 0.8462445751404022


In [None]:
# ### 5.2 Train XGBoost Model

# %%
# 100 boosting rounds with a fixed seed; fit() returns the fitted estimator itself
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Score on the held-out split
xgb_pred = xgb_model.predict(X_test_scaled)
xgb_r2 = r2_score(y_true=y_test, y_pred=xgb_pred)
print("XGBoost R² Score:", xgb_r2)

XGBoost R² Score: 0.8575302831515524


In [None]:
# ### 5.3 Train SVR Model
#
# The RBF-kernel SVR is tuned with a 5-fold grid search scored by R².
# It is trained on a standardized target; predictions are mapped back to
# the original price scale before scoring.

# Hyper-parameter grid explored by the search
param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 1],
    'gamma': ['scale', 'auto'],
}

# Standardize the training target (scaler statistics come from the training split only)
svr_target_scaler = StandardScaler()
y_train_column = y_train.values.reshape(-1, 1)
y_train_svr = svr_target_scaler.fit_transform(y_train_column).ravel()

grid_search = GridSearchCV(
    estimator=SVR(kernel='rbf'),
    param_grid=param_grid,
    cv=5,
    scoring='r2',
)
grid_search.fit(X_train_scaled, y_train_svr)

# Best configuration found by the search, used for test-set predictions
best_svr = grid_search.best_estimator_
y_pred = best_svr.predict(X_test_scaled)

# Undo the target scaling so the score is computed on real prices
y_pred_column = y_pred.reshape(-1, 1)
y_pred_original = svr_target_scaler.inverse_transform(y_pred_column).ravel()
y_test_original = y_test.values  # already on the original scale
svr_r2 = r2_score(y_test_original, y_pred_original)
print("SVR R² Score on Test Set (Original Scale):", svr_r2)

SVR R² Score on Test Set (Original Scale): 0.8449886718281002


In [None]:
# ### 5.4 Train ANN Model
#
# Since the target values (house prices) are large, we scale **y** for ANN training using StandardScaler.
# After training, we inverse-transform predictions for evaluation.
#
# The ANN architecture (as described in the article):
# - Input layer, three hidden layers (64, 32 and 16 units), and one output layer.
# - Hidden layers use ReLU activation.
# - Output layer uses linear activation.
# - Compiled with Adam optimizer (learning rate 0.001) and mean squared error loss.
# - Trained with a batch size of 32.
#
# NOTE: this notebook trains for 100 epochs. An earlier version of this text
# said 50, which did not match the code.

# %%
# Seed Python, NumPy and TensorFlow so weight initialization and batch shuffling
# are repeatable, in line with the random_state=42 used for the other models.
set_random_seed(42)

# Scale the target for ANN training (statistics come from the training split only)
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

# Declare the input shape with an explicit Input layer rather than passing
# `input_dim` to the first Dense layer, which Keras warns against in Sequential models.
input_dim = X_train_scaled.shape[1]
ann_model = Sequential()
ann_model.add(Input(shape=(input_dim,)))
ann_model.add(Dense(64, activation='relu'))
ann_model.add(Dense(32, activation='relu'))
ann_model.add(Dense(16, activation='relu'))
ann_model.add(Dense(1, activation='linear'))

ann_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
history = ann_model.fit(X_train_scaled, y_train_scaled, epochs=100, batch_size=32, verbose=1)

# Evaluate ANN on test data (inverse-transform predictions back to prices)
ann_pred_scaled = ann_model.predict(X_test_scaled)
ann_pred = y_scaler.inverse_transform(ann_pred_scaled)
ann_r2 = r2_score(y_test, ann_pred)
print("ANN R² Score:", ann_r2)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 0.5258
Epoch 2/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 0.2400
Epoch 3/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2136
Epoch 4/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1848
Epoch 5/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1728
Epoch 6/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1676
Epoch 7/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.1529
Epoch 8/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1567
Epoch 9/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1571
Epoch 10/100
[1m541/541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss

In [None]:
# ## 6. Model Performance Summary
#
# Tuned R² scores reported by the article, for comparison:
# - **Random Forest:** ~0.855
# - **XGBoost:** ~0.886
# - **SVR:** ~0.849
# - **ANN:** ~0.887
#
# The scores computed in this notebook are printed below.

# %%
# One formatted line per model, rounded to three decimals
score_lines = [
    "Random Forest: {:.3f}".format(rf_r2),
    "XGBoost: {:.3f}".format(xgb_r2),
    "SVR: {:.3f}".format(svr_r2),
    "ANN: {:.3f}".format(ann_r2),
]

print("\nModel R² Scores on Test Set:")
for score_line in score_lines:
    print(score_line)


Model R² Scores on Test Set:
Random Forest: 0.846
XGBoost: 0.858
SVR: 0.845
ANN: 0.889


In [None]:
# ## 7. Sample Prediction
#
# Suppose we have the following house features:
#
# - **Bedrooms:** 3
# - **Bathrooms:** 2
# - **Square Foot Living:** 1800
# - **Square Foot Lot:** 5000
# - **Floors:** 1
# - **Waterfront:** 0
# - **Condition:** 3
# - **Grade:** 7
# - **Square Foot Above:** 1800
# - **Square Foot Basement:** 0
# - **Age:** 40
# - **Was Renovated:** 0
# - **Was Viewed:** 1
# - **Latitude:** 47.5112
# - **Longitude:** -122.257
# - **Zipcode:** 98052
#
# We preprocess these inputs and obtain predictions from each model.
#
# For the SVR and the ANN, which were trained on a standardized target, we
# inverse-transform the prediction to obtain the original price scale.

# %%
# Define a sample input (features must be in the same order as `features_list`)
sample_input = np.array([3, 2, 1800, 5000, 1, 0, 3, 7, 1800, 0, 40, 0, 1, 47.5112, -122.257, 98052]).reshape(1, -1)

# Attach the training column names: `standard_scaler` was fitted on a DataFrame,
# so passing a bare array makes scikit-learn warn about missing feature names
# and skips the check that the columns match.
sample_frame = pd.DataFrame(sample_input, columns=features_list)

# Preprocess the sample input using the same scalers, in the same order as training
sample_standard = standard_scaler.transform(sample_frame)
sample_scaled = minmax_scaler.transform(sample_standard)

# Tree-based models predict prices directly
sample_rf_pred = rf_model.predict(sample_scaled)[0]
sample_xgb_pred = xgb_model.predict(sample_scaled)[0]

# SVR predicts on the standardized target scale; map it back to a price
sample_svr_pred_scaled = best_svr.predict(sample_scaled)
sample_svr_pred = svr_target_scaler.inverse_transform(sample_svr_pred_scaled.reshape(-1, 1)).ravel()[0]

# ANN also predicts on a standardized target scale; map it back to a price
sample_ann_pred_scaled = ann_model.predict(sample_scaled)
sample_ann_pred = y_scaler.inverse_transform(sample_ann_pred_scaled)[0][0]

print("Sample House Price Predictions:")
print("Random Forest Prediction: ${:.2f}".format(sample_rf_pred))
print("XGBoost Prediction: ${:.2f}".format(sample_xgb_pred))
print("SVR Prediction: ${:.2f}".format(sample_svr_pred))
print("ANN Prediction: ${:.2f}".format(sample_ann_pred))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Sample House Price Predictions:
Random Forest Prediction: $344038.13
XGBoost Prediction: $382937.47
SVR Prediction: $488158.09
ANN Prediction: $479894.88




In [None]:
# Persist the trained ANN together with every scaler needed to use it later.
ann_model.save('best_ann.h5')

# Target scaler: needed to turn the ANN's standardized outputs back into prices
joblib.dump(y_scaler, 'y_scaler.joblib')

# Feature preprocessing is two-stage (StandardScaler, then MinMaxScaler), so
# both fitted scalers must be saved; the StandardScaler alone cannot reproduce
# the inputs the network was trained on.
joblib.dump(minmax_scaler, 'minmax_scaler.joblib')

joblib.dump(standard_scaler, 'feature_scaler.joblib')



['feature_scaler.joblib']

In [None]:
# Restore the persisted artifacts from disk.

# Scaler for the target (rebinds `y_scaler` to the copy read from disk)
y_scaler = joblib.load('y_scaler.joblib')

# First-stage feature scaler, i.e. the object written to 'feature_scaler.joblib'
feature_scaler = joblib.load('feature_scaler.joblib')

# Trained network
loaded_model = load_model('best_ann.h5')



In [None]:
# Check the reloaded model: it should reproduce the in-session ANN prediction.
sample_input = np.array([3, 2, 1800, 5000, 1, 0, 3, 7, 1800, 0, 40, 0, 1, 47.5112, -122.257, 98052]).reshape(1, -1)

# Attach the training column names so the scaler can validate them
# (it was fitted on a DataFrame and warns when given a bare array).
sample_frame = pd.DataFrame(sample_input, columns=features_list)

# Use the scaler that was reloaded from disk, not the in-memory `standard_scaler`,
# so this cell actually exercises the persisted artifact.
sample_standard = feature_scaler.transform(sample_frame)

# NOTE(review): `minmax_scaler` is still the object from the live session; it must
# also be restored from disk for this cell to work in a fresh kernel.
sample_scaled = minmax_scaler.transform(sample_standard)

# Predict on the standardized target scale, then map back to a price
sample_ann_pred_scaled = loaded_model.predict(sample_scaled)
sample_ann_pred = y_scaler.inverse_transform(sample_ann_pred_scaled)[0][0]

print("ANN Prediction: ${:.2f}".format(sample_ann_pred))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
ANN Prediction: $479894.88
