In [57]:
import pandas as pd
import os
import sys
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression,ElasticNet, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA



In [2]:
notebook_dir = os.getcwd()


In [3]:
pd.set_option('display.max_columns', 500)

os.chdir(notebook_dir)
os.chdir('..')
df1 = pd.read_csv("data/Austin_For_Sale_CLEANED.csv")
df2 = pd.read_csv("data/Austin_Sold_CLEANED.csv")

In [4]:
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [5]:
df2.head()

Unnamed: 0,property_url,mls,mls_id,status,text,style,full_street_line,street,unit,city,state,zip_code,beds,full_baths,half_baths,sqft,year_built,days_on_mls,list_price,list_date,last_sold_date,assessed_value,estimated_value,lot_sqft,price_per_sqft,latitude,longitude,neighborhoods,fips_code,stories,hoa_fee,parking_garage,nearby_schools,primary_photo,alt_photos,geometry,days_on_market
0,https://www.realtor.com/realestateandhomes-det...,CTTX,537428,SOLD,fabulous single story home near Domain and eas...,SINGLE_FAMILY,12803 Irongate Ave,12803 Irongate Ave,,Austin,TX,78727,3.0,2.0,0.0,1387.0,1977.0,75.0,375000.0,2024-03-16,2024-05-30,458786.0,375005.0,8246.0,,30.42124,-97.694559,"Gracy Woods, Lamplight Village, Northwest Austin",48453.0,1.0,0.0,2.0,"Pflugerville Independent School District, Chap...",http://ap.rdcpix.com/afe0492e10a406866f05db5ce...,http://ap.rdcpix.com/afe0492e10a406866f05db5ce...,POINT (-97.694559 30.42124),75
1,https://www.realtor.com/realestateandhomes-det...,CTTX,540753,SOLD,HERE'S AN OPPORTUNITY YOU DON'T WANT TO MISS.....,SINGLE_FAMILY,1403 Minnie Dr,1403 Minnie Dr,,Austin,TX,78732,1.0,1.0,0.0,704.0,1982.0,34.0,440000.0,2024-04-18,2024-05-22,388865.0,437000.0,12990.0,,30.330755,-97.926471,Northwest Austin,48453.0,1.0,0.0,0.0,Leander Independent School District,http://ap.rdcpix.com/99d2b8808ea4b547b35a52da1...,http://ap.rdcpix.com/99d2b8808ea4b547b35a52da1...,POINT (-97.926471 30.330755),34
2,https://www.realtor.com/realestateandhomes-det...,CTTX,534869,SOLD,PRICE IMPROVEMENT! Enjoy living in the heart o...,SINGLE_FAMILY,6004 Open Range Trl,6004 Open Range Trl,,Austin,TX,78749,4.0,2.0,0.0,1886.0,1984.0,112.0,497000.0,2024-02-23,2024-06-14,367134.0,497731.0,11543.0,,30.221694,-97.86717,"Village at Western Oaks, Southwest Austin",48453.0,1.0,0.0,2.0,"Trinity Charter School, Austin Independent Sch...",http://ap.rdcpix.com/03a53e9c364a418ea8dab4367...,http://ap.rdcpix.com/03a53e9c364a418ea8dab4367...,POINT (-97.86717 30.221694),112
3,https://www.realtor.com/realestateandhomes-det...,HLTX,168089,SOLD,Amazing Find In Cuernavaca w/ Hot Tub & Pool. ...,SINGLE_FAMILY,2814 Saratoga Dr,2814 Saratoga Dr,,Austin,TX,78733,3.0,2.0,0.0,1920.0,1982.0,95.0,950000.0,2024-03-27,2024-06-30,677461.0,940000.0,15682.0,,30.350446,-97.861136,"West Austin, Austin Lake Estates",48453.0,1.0,0.0,0.0,Eanes Independent School District,http://ap.rdcpix.com/2ec12f0dd45b5eadf66e3759b...,http://ap.rdcpix.com/2ec12f0dd45b5eadf66e3759b...,POINT (-97.861136 30.350446),95
4,https://www.realtor.com/realestateandhomes-det...,CTTX,538228,SOLD,Welcome to this beautiful home nestled in the ...,SINGLE_FAMILY,7600 Yaupon Dr,7600 Yaupon Dr,,Austin,TX,78759,4.0,2.0,1.0,2901.0,1992.0,92.0,955000.0,2024-03-24,2024-06-24,716925.0,956108.0,10415.0,,30.4142,-97.778989,"Jollyville, Northwest Austin, Great Hills",48453.0,2.0,4.0,2.0,"Round Rock Independent School District, Harmon...",http://ap.rdcpix.com/3ab570c6480e6c091549c8e35...,http://ap.rdcpix.com/3ab570c6480e6c091549c8e35...,POINT (-97.778989 30.4142),92


In [6]:
#REMOVE OUTLIERS AND NORMALIZE VARIABLES

for column in ['beds','full_baths', 'half_baths' ,'sqft','days_on_market','lot_sqft','hoa_fee','parking_garage','list_price']:
    df2 = remove_outliers_iqr(df2, column)
scaler = StandardScaler()
df2[['beds', 'full_baths', 'half_baths','sqft','lot_sqft','hoa_fee','parking_garage','list_price']] = scaler.fit_transform(df2[['beds', 'full_baths', 'half_baths','sqft','lot_sqft','hoa_fee','parking_garage','list_price']])


# OLS model testing for LIST PRICE

In [7]:
#Assign X, y
X = df2[['beds','full_baths', 'half_baths' ,'sqft','days_on_market','lot_sqft','hoa_fee','parking_garage']]
X = sm.add_constant(X)
y = df2['list_price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#create OLS model testing for LIST PRICE
model = sm.OLS(y_train, X_train).fit()

# Predict on the testing set
y_pred = model.predict(X_test)


In [8]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 0.7358682482000477
Mean Squared Error: 0.745883651081248
R-squared: 0.28458181942368854


In [9]:
# Summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             list_price   R-squared:                       0.262
Model:                            OLS   Adj. R-squared:                  0.242
Method:                 Least Squares   F-statistic:                     13.31
Date:                Sun, 14 Jul 2024   Prob (F-statistic):           6.76e-13
Time:                        17:15:12   Log-Likelihood:                -292.68
No. Observations:                 232   AIC:                             599.4
Df Residuals:                     225   BIC:                             623.5
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.0658      0.109      0.

  return np.sqrt(eigvals[0]/eigvals[-1])


# OLS model testing for DAYS ON MARKET

In [10]:
#Assign X, y
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
X = sm.add_constant(X)
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

#create OLS model testing for DAYS ON MARKET
model = sm.OLS(y_train, X_train).fit()

# Predict on the testing set
y_pred = model.predict(X_test)

In [11]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 43.51207903429464
Mean Squared Error: 2534.3295983765456
R-squared: -0.13188372160395878


In [12]:
# Summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         days_on_market   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                 -0.016
Method:                 Least Squares   F-statistic:                    0.4029
Date:                Sun, 14 Jul 2024   Prob (F-statistic):              0.877
Time:                        17:15:12   Log-Likelihood:                -1252.2
No. Observations:                 232   AIC:                             2518.
Df Residuals:                     225   BIC:                             2543.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             87.5895      3.573     24.

  return np.sqrt(eigvals[0]/eigvals[-1])


# LASSO model testing for DAYS ON MARKET

In [14]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Perform Grid Search to find the best alpha for Lasso
lasso = Lasso()
params = {'alpha': np.logspace(-4, 0, 50)}
grid_search = GridSearchCV(lasso, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-04,
       2.12095089e-04, 2.55954792e-04, 3.08884360e-04, 3.72759372e-04,
       4.49843267e-04, 5.42867544e-04, 6.55128557e-04, 7.90604321e-04,
       9.54095476e-04, 1.15139540e-03, 1.38949549e-03, 1.67683294e-03,
       2.02358965e-03, 2.44205309e-03, 2.94705170e-03, 3....
       9.10298178e-03, 1.09854114e-02, 1.32571137e-02, 1.59985872e-02,
       1.93069773e-02, 2.32995181e-02, 2.81176870e-02, 3.39322177e-02,
       4.09491506e-02, 4.94171336e-02, 5.96362332e-02, 7.19685673e-02,
       8.68511374e-02, 1.04811313e-01, 1.26485522e-01, 1.52641797e-01,
       1.84206997e-01, 2.22299648e-01, 2.68269580e-01, 3.23745754e-01,
       3.90693994e-01, 4.71486636e-01, 5.68986603e-01, 6.86648845e-01,
       8.28642773e-01, 1.00000000e+00])},
             scoring='r2')

In [17]:
# Best model
best_lasso = grid_search.best_estimator_

# Fit the best Lasso model
best_lasso.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_lasso.predict(X_test_scaled)

In [18]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best alpha: {grid_search.best_params_["alpha"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_lasso.coef_}')
print(f'Intercept: {best_lasso.intercept_}')

Best alpha: 1.0
Mean Absolute Error: 47.32893455836736
Mean Squared Error: 3227.059710171491
R-squared: -0.024485859421502942
Coefficients: [-0.          0.         -1.38989752  3.0179572  -0.          2.95169296
  0.         -1.72848649]
Intercept: 84.97931034482758


In [19]:
# Feature selection
selected_features = X.columns[best_lasso.coef_ != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['half_baths', 'sqft', 'hoa_fee', 'list_price'], dtype='object')


# ELASTIC NET model testing for DAYS ON MARKET

In [22]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [23]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Perform Grid Search to find the best alpha and l1_ratio for Elastic Net
elastic_net = ElasticNet()
params = {
    'alpha': np.logspace(-4, 0, 50),
    'l1_ratio': np.linspace(0, 1, 50)
}
grid_search = GridSearchCV(elastic_net, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-04,
       2.12095089e-04, 2.55954792e-04, 3.08884360e-04, 3.72759372e-04,
       4.49843267e-04, 5.42867544e-04, 6.55128557e-04, 7.90604321e-04,
       9.54095476e-04, 1.15139540e-03, 1.38949549e-03, 1.67683294e-03,
       2.02358965e-03, 2.44205309e-03, 2.94705170e-...
       0.30612245, 0.32653061, 0.34693878, 0.36734694, 0.3877551 ,
       0.40816327, 0.42857143, 0.44897959, 0.46938776, 0.48979592,
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 1.        ])},
             scoring='r2')

In [25]:
# Best model
best_elastic_net = grid_search.best_estimator_

# Fit the best Elastic Net model
best_elastic_net.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_elastic_net.predict(X_test_scaled)

  model = cd_fast.enet_coordinate_descent(


In [26]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best alpha: {grid_search.best_params_["alpha"]}')
print(f'Best l1_ratio: {grid_search.best_params_["l1_ratio"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_elastic_net.coef_}')
print(f'Intercept: {best_elastic_net.intercept_}')

Best alpha: 1.0
Best l1_ratio: 0.0
Mean Absolute Error: 47.10825261071312
Mean Squared Error: 3201.0076977778713
R-squared: -0.01621519798235682
Coefficients: [-0.13571387  0.         -0.87950263  1.96093107 -0.34179147  2.09002681
  0.         -1.15148432]
Intercept: 84.97931034482758


In [27]:
# Feature selection
selected_features = X.columns[best_elastic_net.coef_ != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['beds', 'half_baths', 'sqft', 'lot_sqft', 'hoa_fee', 'list_price'], dtype='object')


# RIDGE REGRESSION model testing for DAYS ON MARKET


In [29]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [30]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
# Perform Grid Search to find the best alpha for Ridge
ridge = Ridge()
params = {'alpha': np.logspace(-4, 4, 50)}
grid_search = GridSearchCV(ridge, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
       4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
       2.02358965e-03, 2.94705170e-03, 4.29193426e-03, 6.25055193e-03,
       9.10298178e-03, 1.32571137e-02, 1.93069773e-02, 2.81176870e-02,
       4.09491506e-02, 5.96362332e-02, 8.68511374e-02, 1....
       8.28642773e-01, 1.20679264e+00, 1.75751062e+00, 2.55954792e+00,
       3.72759372e+00, 5.42867544e+00, 7.90604321e+00, 1.15139540e+01,
       1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
       7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
       3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
       1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
       6.86648845e+03, 1.00000000e+04])},
             scoring='r2')

In [32]:
# Best model
best_ridge = grid_search.best_estimator_

# Fit the best Ridge model
best_ridge.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_ridge.predict(X_test_scaled)

In [33]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best alpha: {grid_search.best_params_["alpha"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_ridge.coef_}')
print(f'Intercept: {best_ridge.intercept_}')

Best alpha: 10000.0
Mean Absolute Error: 46.775014773773115
Mean Squared Error: 3152.761978580352
R-squared: -0.0008987608740853581
Coefficients: [ 0.00629084  0.         -0.00640322  0.05083303 -0.01361824  0.06815332
  0.         -0.02643446]
Intercept: 84.97931034482758


In [34]:
# Feature selection
selected_features = X.columns[best_ridge.coef_ != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['beds', 'half_baths', 'sqft', 'lot_sqft', 'hoa_fee', 'list_price'], dtype='object')


# SVR LINEAR model testing for DAYS ON MARKET


In [36]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [37]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
# Perform Grid Search to find the best parameters for SVR
svr = SVR(kernel='linear')
params = {'C': np.logspace(-4, 4, 10)}
grid_search = GridSearchCV(svr, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=SVR(kernel='linear'),
             param_grid={'C': array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04])},
             scoring='r2')

In [39]:
# Best model
best_svr = grid_search.best_estimator_

# Fit the best SVR model
best_svr.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_svr.predict(X_test_scaled)

In [40]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best C: {grid_search.best_params_["C"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_svr.coef_}')
print(f'Intercept: {best_svr.intercept_}')

Best C: 0.0001
Mean Absolute Error: 45.63033941292769
Mean Squared Error: 3354.067529307538
R-squared: -0.06480668594067795
Coefficients: [[ 8.42973392e-04  0.00000000e+00 -4.06575815e-19  7.37709925e-04
  -7.51542512e-04  5.08407272e-04  0.00000000e+00 -1.17976573e-03]]
Intercept: [72.00056149]


In [41]:
# Feature selection
selected_features = X.columns[best_svr.coef_[0] != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['beds', 'half_baths', 'sqft', 'lot_sqft', 'hoa_fee', 'list_price'], dtype='object')


# SVR RBF model testing for DAYS ON MARKET


In [43]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [44]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [45]:
# Perform Grid Search to find the best parameters for SVR with RBF kernel
svr = SVR(kernel='rbf')
params = {
    'C': np.logspace(-2, 2, 5),
    'gamma': np.logspace(-2, 2, 5)
}
grid_search = GridSearchCV(svr, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'gamma': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
             scoring='r2')

In [46]:

# Best model
best_svr = grid_search.best_estimator_

# Fit the best SVR model
best_svr.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_svr.predict(X_test_scaled)

In [47]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Best parameters: {'C': 100.0, 'gamma': 100.0}
Mean Absolute Error: 46.67502309999193
Mean Squared Error: 3142.979850522068
R-squared: 0.0022067446854106


# RANDOM FOREST ENSEMBLE REGRESSOR model testing for DAYS ON MARKET


In [49]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [50]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [51]:
# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=0)

In [52]:
# Perform Grid Search to find the best parameters for Random Forest
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': [None, 10, 20, 30],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]},
             scoring='r2')

In [53]:
# Best model
best_rf = grid_search.best_estimator_

# Fit the best Random Forest model
best_rf.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_rf.predict(X_test_scaled)

In [54]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Mean Absolute Error: 47.85726443777907
Mean Squared Error: 3335.345272270283
R-squared: -0.058862981976767426


In [56]:
# Feature importance
feature_importance = pd.Series(best_rf.feature_importances_, index=X.columns)
print(f'Feature Importance:\n{feature_importance.sort_values(ascending=False)}')

Feature Importance:
list_price        0.351938
lot_sqft          0.232800
sqft              0.208279
hoa_fee           0.179302
beds              0.016499
half_baths        0.011182
full_baths        0.000000
parking_garage    0.000000
dtype: float64


# PRINCIPLE COMPONENT ANALYSIS (PCA) model testing for DAYS ON MARKET

In [63]:
df2[['beds', 'full_baths', 'half_baths','sqft','lot_sqft','hoa_fee','parking_garage','list_price']]

Unnamed: 0,beds,full_baths,half_baths,sqft,lot_sqft,hoa_fee,parking_garage,list_price
0,-0.686202,0.0,-0.718070,-1.000278,0.177283,-0.860377,0.0,-0.760092
2,1.372405,0.0,-0.718070,0.167519,1.655616,-0.860377,0.0,0.260582
5,-0.686202,0.0,-0.718070,-1.107930,-1.762442,-0.860377,0.0,0.026329
12,1.372405,0.0,1.392621,0.169859,0.628360,-0.860377,0.0,1.037799
14,1.372405,0.0,-0.718070,-0.029064,1.479848,-0.860377,0.0,-0.341783
...,...,...,...,...,...,...,...,...
672,-0.686202,0.0,-0.718070,0.151137,-0.199811,1.670085,0.0,-0.555120
673,1.372405,0.0,1.392621,2.634166,1.134141,0.210203,0.0,1.105565
675,-0.686202,0.0,-0.718070,-0.918368,0.210463,-0.860377,0.0,-1.346560
677,1.372405,0.0,1.392621,1.620828,-0.508302,0.988807,0.0,-0.342620


In [64]:
# Standardize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df2[['beds', 'full_baths', 'half_baths','sqft','lot_sqft','hoa_fee','parking_garage','list_price']])
# Apply PCA
pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
principal_components = pca.fit_transform(df_scaled)


In [65]:
# Create a DataFrame with the principal components
df_pca = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

In [66]:
print(df_pca.head())
print('Explained Variance Ratio:', pca.explained_variance_ratio_)
print('Singular Values:', pca.singular_values_)

        PC1       PC2
0 -1.729063  0.356675
1  0.462748  1.946182
2 -1.719619 -0.698465
3  1.570950  0.828130
4  0.103068  1.550337
Explained Variance Ratio: [0.35759134 0.23003922]
Singular Values: [24.98708636 20.04116973]
