### Modeling real estate prices

In [1]:
#Import Required Libraries 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import pandas as pd
import project_libs as libs
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import warnings
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
#load Data
data = libs.read_csv_to_dataframe("data/data.csv")
df = data.copy()

shape (4600, 18)
----------------------------------------------------------------------------------------------------
List of columns
['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country']
----------------------------------------------------------------------------------------------------
Data info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   i

In [3]:
#view top 5
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


For our modeling purposes, we will drop the 'date' and 'country' columns because the data is from the same year and country.

In [4]:
#drop unnecessary columns
df.drop(columns=['date','country'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          4600 non-null   float64
 1   bedrooms       4600 non-null   float64
 2   bathrooms      4600 non-null   float64
 3   sqft_living    4600 non-null   int64  
 4   sqft_lot       4600 non-null   int64  
 5   floors         4600 non-null   float64
 6   waterfront     4600 non-null   int64  
 7   view           4600 non-null   int64  
 8   condition      4600 non-null   int64  
 9   sqft_above     4600 non-null   int64  
 10  sqft_basement  4600 non-null   int64  
 11  yr_built       4600 non-null   int64  
 12  yr_renovated   4600 non-null   int64  
 13  street         4600 non-null   object 
 14  city           4600 non-null   object 
 15  statezip       4600 non-null   object 
dtypes: float64(4), int64(9), object(3)
memory usage: 575.1+ KB


Street, city, and zip code state are categorical columns that must be labeled as numbers using a label encoder.
Price is a target column

In [5]:
#Features to label
features = ['street','city','statezip']

#Target column 
target_column = 'price'

List models to run in the pipeline function for the selection of the best model

In [6]:
# Define the list of models to run in the pipeline
models = [
    ('Random Forest', RandomForestRegressor, {'n_estimators': 100}),
    ('Gradient Boosting', GradientBoostingRegressor, {'n_estimators': 100}),
    ('Ridge Regression', Ridge, {'alpha': 1.0}),
    ('Lasso Regression', Lasso, {'alpha': 1.0}),
    ('ElasticNet Regression', ElasticNet, {'alpha': 1.0, 'l1_ratio': 0.5}),
    ('Decision Tree', DecisionTreeRegressor, {'max_depth': 5}),
    ('Extra Trees', ExtraTreesRegressor, {'n_estimators': 100}),
    ('KNN', KNeighborsRegressor, {'n_neighbors': 5}),
    ('Gaussian Process', GaussianProcessRegressor, {'n_restarts_optimizer': 10, 'alpha': 0.1}),
    
]


Input all required parameters and execute the pipeline function to obtain the best model.

In [7]:
#running pipeline function
X_train, X_test, y_train, y_test,result = libs.run_ml_pipeline(df, target_column, features, models, encoding_method='label',  use_cross_validation=False)

____________________________________________________________________________________________________
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
dtype: int64
No missing values found. No imputation needed.
____________________________________________________________________________________________________
____________________________________________________________________________________________________
Data Frame shape (4600, 16)
((3680, 15), (920, 15), (3680,), (920,))
____________________________________________________________________________________________________
Model: Random Forest
Training Score: 0.94
Test Score: 0.04
----------------------------------------
Model: Gradient Boosting
Training Score: 0.81
Test Score: 0.05


  model = cd_fast.enet_coordinate_descent(


Model: Extra Trees
Training Score: 1.00
Test Score: 0.04
----------------------------------------
Model: KNN
Training Score: 0.66
Test Score: 0.03
----------------------------------------
Model: Gaussian Process
Training Score: 0.98
Test Score: -0.05
----------------------------------------


View the pipeline results and select the best model based on train and test scores

In [8]:
#pipeline result
result.sort_values(by = 'Test Score',ascending= False)

Unnamed: 0,Model Name,Training Score,Test Score
1,Gradient Boosting,0.814809,0.050579
0,Random Forest,0.944838,0.043093
6,Extra Trees,0.999996,0.038737
4,ElasticNet Regression,0.512828,0.038632
2,Ridge Regression,0.549346,0.032914
3,Lasso Regression,0.549346,0.032902
7,KNN,0.663676,0.032638
5,Decision Tree,0.639415,0.002634
8,Gaussian Process,0.976093,-0.052371


From the pipeline results, Gradient Boosting is the best model. Using the best model,
train with the best parameters by applying hyperparameter tuning 

In [9]:
model = GradientBoostingRegressor()
param_grid = {
    'n_estimators': [5, 15, 10], # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.1, 0.2], # Step size shrinkage used in update to prevent overfitting
    'max_depth': [3, 5, 7], # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],# Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'subsample': [0.8, 1.0],# Subsample ratio of the training instance
    'max_features': [10, 'sqrt', 'log2', None],  # Number of features to consider when looking for the best split
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile']  # Loss function to be optimized
}


best_model, best_params, best_score = libs.hyperparameter_tuning(model, param_grid, X_train, y_train, 'r2')

Train and Predict with the tuned model

In [10]:
# Train the tuned model
best_model.fit(X_train, y_train)

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy score
R2Score = r2_score(y_test, y_pred)

print(R2Score)

0.043614916733629716
