In [1]:
import pandas as pd
import os
import sys
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression,ElasticNet, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA



  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
notebook_dir = os.getcwd()


In [40]:
pd.set_option('display.max_columns', 500)

os.chdir(notebook_dir)
os.chdir('..')
df1 = pd.read_csv("data/Austin_For_Sale_CLEANED.csv")
df2 = pd.read_csv("data/Austin_Sold_CLEANED.csv")

In [31]:
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [42]:
#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse=False)
# Apply one-hot encoding to the categorical columns
encoded_cat = encoder.fit_transform(df2[['Location']])


In [43]:
encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(['Location']))

# Concatenate the original DataFrame (excluding the original categorical column) with the encoded DataFrame
df2 = pd.concat([df2.drop(columns=['Location']), encoded_cat_df], axis=1)

# OLS model testing for LIST PRICE

In [47]:

#Assign X, y
X = df2[['beds','full_baths', 'half_baths' ,'sqft','days_on_market','lot_sqft','hoa_fee','parking_garage','Location_ANDERSON MILL', 'Location_AVERY RANCH--LAKELINE',
       'Location_BLUFF SPRINGS', 'Location_BRODIE LANE',
       'Location_CHERRY CREEK', 'Location_CIRCLE C SOUTH',
       'Location_DECKER LAKE', 'Location_DEL VALLE EAST',
       'Location_DITTMAR--SLAUGHTER', 'Location_EAST OAK HILL',
       'Location_FRANKLIN PARK', 'Location_GARRISON PARK',
       'Location_GEORGIAN ACRES', 'Location_GRACY WOODS',
       'Location_HARRIS BRANCH', 'Location_HAYS WARTHA',
       'Location_HERITAGE HILLS', 'Location_JESTER',
       'Location_JOHNSTON TERRACE', 'Location_JOLLYVILLE',
       'Location_MANSFIELD--RIVER PLACE', 'Location_MCKINNEY',
       'Location_MCNEIL', 'Location_NORTH LAMAR',
       'Location_NORTH LAMAR RUNDBERG', 'Location_NORTH SHOAL CREEK',
       'Location_ONION CREEK', 'Location_PECAN SPRINGS-SPRINGDALE',
       'Location_POND SPRINGS', 'Location_ROGERS HILL', 'Location_Rural',
       'Location_SAMSUNG--PIONEER CROSSING', 'Location_SLAUGHTER CREEK',
       'Location_SOUTH BRODIE', 'Location_SOUTHEAST', 'Location_SWEETBRIAR',
       'Location_TECH RIDGE', 'Location_UNIVERSITY HILLS',
       'Location_VILLAGE AT WESTERN OAKS', 'Location_WEST OAK HILL',
       'Location_WESTGATE', 'Location_WINDSOR HILLS', 'Location_WINDSOR PARK',
       'Location_WOOTEN']]
X = sm.add_constant(X)
y = df2['list_price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#create OLS model testing for LIST PRICE
model = sm.OLS(y_train, X_train).fit()

# Predict on the testing set
y_pred = model.predict(X_test)


In [48]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 349762.20364567346
Mean Squared Error: 336966890716.1869
R-squared: 0.5008420663910493


In [49]:
# Summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             list_price   R-squared:                       0.694
Model:                            OLS   Adj. R-squared:                  0.662
Method:                 Least Squares   F-statistic:                     21.55
Date:                Tue, 27 Aug 2024   Prob (F-statistic):           1.96e-96
Time:                        11:20:35   Log-Likelihood:                -8073.8
No. Observations:                 546   AIC:                         1.625e+04
Df Residuals:                     493   BIC:                         1.648e+04
Df Model:                          52                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

# OLS model testing for DAYS ON MARKET

In [50]:
#Assign X, y
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
X = sm.add_constant(X)
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

#create OLS model testing for DAYS ON MARKET
model = sm.OLS(y_train, X_train).fit()

# Predict on the testing set
y_pred = model.predict(X_test)

In [51]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 50.32380573466582
Mean Squared Error: 4134.090464460151
R-squared: 0.06769950997067542


In [52]:
# Summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         days_on_market   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     2.335
Date:                Tue, 27 Aug 2024   Prob (F-statistic):             0.0189
Time:                        11:24:29   Log-Likelihood:                -1985.8
No. Observations:                 341   AIC:                             3990.
Df Residuals:                     332   BIC:                             4024.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            114.8031     21.835      5.

# LASSO model testing for DAYS ON MARKET

In [53]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [54]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [55]:
# Perform Grid Search to find the best alpha for Lasso
lasso = Lasso()
params = {'alpha': np.logspace(-4, 0, 50)}
grid_search = GridSearchCV(lasso, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-04,
       2.12095089e-04, 2.55954792e-04, 3.08884360e-04, 3.72759372e-04,
       4.49843267e-04, 5.42867544e-04, 6.55128557e-04, 7.90604321e-04,
       9.54095476e-04, 1.15139540e-03, 1.38949549e-03, 1.67683294e-03,
       2.02358965e-03, 2.44205309e-03, 2.94705170e-03, 3....
       9.10298178e-03, 1.09854114e-02, 1.32571137e-02, 1.59985872e-02,
       1.93069773e-02, 2.32995181e-02, 2.81176870e-02, 3.39322177e-02,
       4.09491506e-02, 4.94171336e-02, 5.96362332e-02, 7.19685673e-02,
       8.68511374e-02, 1.04811313e-01, 1.26485522e-01, 1.52641797e-01,
       1.84206997e-01, 2.22299648e-01, 2.68269580e-01, 3.23745754e-01,
       3.90693994e-01, 4.71486636e-01, 5.68986603e-01, 6.86648845e-01,
       8.28642773e-01, 1.00000000e+00])},
             scoring='r2')

In [56]:
# Best model
best_lasso = grid_search.best_estimator_

# Fit the best Lasso model
best_lasso.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_lasso.predict(X_test_scaled)

In [57]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best alpha: {grid_search.best_params_["alpha"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_lasso.coef_}')
print(f'Intercept: {best_lasso.intercept_}')

Best alpha: 1.0
Mean Absolute Error: 49.72767258985259
Mean Squared Error: 3971.730079290627
R-squared: 0.10431425460584487
Coefficients: [-11.69953625  23.8180892    0.           0.           7.89789369
  -0.7733619   -9.12965694  -6.45885932]
Intercept: 99.24046920821114


In [58]:
# Feature selection
selected_features = X.columns[best_lasso.coef_ != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['beds', 'full_baths', 'lot_sqft', 'hoa_fee', 'parking_garage',
       'list_price'],
      dtype='object')


# ELASTIC NET model testing for DAYS ON MARKET

In [59]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [60]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [61]:
# Perform Grid Search to find the best alpha and l1_ratio for Elastic Net
elastic_net = ElasticNet()
params = {
    'alpha': np.logspace(-4, 0, 50),
    'l1_ratio': np.linspace(0, 1, 50)
}
grid_search = GridSearchCV(elastic_net, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-04,
       2.12095089e-04, 2.55954792e-04, 3.08884360e-04, 3.72759372e-04,
       4.49843267e-04, 5.42867544e-04, 6.55128557e-04, 7.90604321e-04,
       9.54095476e-04, 1.15139540e-03, 1.38949549e-03, 1.67683294e-03,
       2.02358965e-03, 2.44205309e-03, 2.94705170e-...
       0.30612245, 0.32653061, 0.34693878, 0.36734694, 0.3877551 ,
       0.40816327, 0.42857143, 0.44897959, 0.46938776, 0.48979592,
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 1.        ])},
             scoring='r2')

In [62]:
# Best model
best_elastic_net = grid_search.best_estimator_

# Fit the best Elastic Net model
best_elastic_net.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_elastic_net.predict(X_test_scaled)

  model = cd_fast.enet_coordinate_descent(


In [63]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best alpha: {grid_search.best_params_["alpha"]}')
print(f'Best l1_ratio: {grid_search.best_params_["l1_ratio"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_elastic_net.coef_}')
print(f'Intercept: {best_elastic_net.intercept_}')

Best alpha: 1.0
Best l1_ratio: 0.0
Mean Absolute Error: 49.78599889091765
Mean Squared Error: 3996.835643981021
R-squared: 0.09865256663250888
Coefficients: [-1.85921401  5.12229064 -0.74725564  1.94014968  3.61419677 -0.42586634
 -3.25543079 -0.28884404]
Intercept: 99.24046920821114


In [64]:
# Feature selection
selected_features = X.columns[best_elastic_net.coef_ != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['beds', 'full_baths', 'half_baths', 'sqft', 'lot_sqft', 'hoa_fee',
       'parking_garage', 'list_price'],
      dtype='object')


# RIDGE REGRESSION model testing for DAYS ON MARKET


In [65]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [66]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [67]:
# Perform Grid Search to find the best alpha for Ridge
ridge = Ridge()
params = {'alpha': np.logspace(-4, 4, 50)}
grid_search = GridSearchCV(ridge, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
       4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
       2.02358965e-03, 2.94705170e-03, 4.29193426e-03, 6.25055193e-03,
       9.10298178e-03, 1.32571137e-02, 1.93069773e-02, 2.81176870e-02,
       4.09491506e-02, 5.96362332e-02, 8.68511374e-02, 1....
       8.28642773e-01, 1.20679264e+00, 1.75751062e+00, 2.55954792e+00,
       3.72759372e+00, 5.42867544e+00, 7.90604321e+00, 1.15139540e+01,
       1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
       7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
       3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
       1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
       6.86648845e+03, 1.00000000e+04])},
             scoring='r2')

In [68]:
# Best model
best_ridge = grid_search.best_estimator_

# Fit the best Ridge model
best_ridge.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_ridge.predict(X_test_scaled)

In [69]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best alpha: {grid_search.best_params_["alpha"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_ridge.coef_}')
print(f'Intercept: {best_ridge.intercept_}')

Best alpha: 1526.4179671752302
Mean Absolute Error: 50.24933252877379
Mean Squared Error: 4238.6915956609355
R-squared: 0.044110358568579366
Coefficients: [-0.17852855  1.64039465 -0.20959396  0.85129303  1.35807716 -0.02547285
 -0.7889674   0.31030841]
Intercept: 99.24046920821114


In [70]:
# Feature selection
selected_features = X.columns[best_ridge.coef_ != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['beds', 'full_baths', 'half_baths', 'sqft', 'lot_sqft', 'hoa_fee',
       'parking_garage', 'list_price'],
      dtype='object')


# SVR LINEAR model testing for DAYS ON MARKET


In [71]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [72]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [73]:
# Perform Grid Search to find the best parameters for SVR
svr = SVR(kernel='linear')
params = {'C': np.logspace(-4, 4, 10)}
grid_search = GridSearchCV(svr, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=SVR(kernel='linear'),
             param_grid={'C': array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04])},
             scoring='r2')

In [74]:
# Best model
best_svr = grid_search.best_estimator_

# Fit the best SVR model
best_svr.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_svr.predict(X_test_scaled)

In [75]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best C: {grid_search.best_params_["C"]}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Coefficients: {best_svr.coef_}')
print(f'Intercept: {best_svr.intercept_}')

Best C: 0.0001
Mean Absolute Error: 47.520017073473745
Mean Squared Error: 4744.495897227502
R-squared: -0.06995623050666211
Coefficients: [[ 0.0023233   0.0046595   0.00090472  0.00376255  0.00434468  0.00085589
  -0.00195522  0.00218695]]
Intercept: [76.89797516]


In [76]:
# Feature selection
selected_features = X.columns[best_svr.coef_[0] != 0]
print(f'Selected Features: {selected_features}')

Selected Features: Index(['beds', 'full_baths', 'half_baths', 'sqft', 'lot_sqft', 'hoa_fee',
       'parking_garage', 'list_price'],
      dtype='object')


# SVR RBF model testing for DAYS ON MARKET


In [77]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [78]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [79]:
# Perform Grid Search to find the best parameters for SVR with RBF kernel
svr = SVR(kernel='rbf')
params = {
    'C': np.logspace(-2, 2, 5),
    'gamma': np.logspace(-2, 2, 5)
}
grid_search = GridSearchCV(svr, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'gamma': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
             scoring='r2')

In [80]:

# Best model
best_svr = grid_search.best_estimator_

# Fit the best SVR model
best_svr.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_svr.predict(X_test_scaled)

In [81]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Best parameters: {'C': 100.0, 'gamma': 100.0}
Mean Absolute Error: 48.65875937711222
Mean Squared Error: 4522.3493453302
R-squared: -0.019858792878570952


# RANDOM FOREST ENSEMBLE REGRESSOR model testing for DAYS ON MARKET


In [82]:
X = df2[['beds','full_baths', 'half_baths' ,'sqft','lot_sqft','hoa_fee','parking_garage','list_price']]
y = df2['days_on_market']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [83]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [84]:
# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=0)

In [85]:
# Perform Grid Search to find the best parameters for Random Forest
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf, params, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': [None, 10, 20, 30],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]},
             scoring='r2')

In [86]:
# Best model
best_rf = grid_search.best_estimator_

# Fit the best Random Forest model
best_rf.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = best_rf.predict(X_test_scaled)

In [87]:
# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Mean Absolute Error: 51.71784702180536
Mean Squared Error: 4632.13118939855
R-squared: -0.04461627409508595


In [88]:
# Feature importance
feature_importance = pd.Series(best_rf.feature_importances_, index=X.columns)
print(f'Feature Importance:\n{feature_importance.sort_values(ascending=False)}')

Feature Importance:
sqft              0.270885
list_price        0.247813
lot_sqft          0.212935
hoa_fee           0.101877
beds              0.077749
full_baths        0.053728
parking_garage    0.025717
half_baths        0.009296
dtype: float64


# PRINCIPLE COMPONENT ANALYSIS (PCA) model testing for DAYS ON MARKET

In [89]:
df2[['beds', 'full_baths', 'half_baths','sqft','lot_sqft','hoa_fee','parking_garage','list_price']]

Unnamed: 0,beds,full_baths,half_baths,sqft,lot_sqft,hoa_fee,parking_garage,list_price
0,3.0,2.0,0.0,1387.0,8246.0,0.0,2.0,375000.0
1,1.0,1.0,0.0,704.0,12990.0,0.0,0.0,440000.0
2,4.0,2.0,0.0,1886.0,11543.0,0.0,2.0,497000.0
3,3.0,2.0,0.0,1920.0,15682.0,0.0,0.0,950000.0
4,4.0,2.0,1.0,2901.0,10415.0,4.0,2.0,955000.0
...,...,...,...,...,...,...,...,...
678,5.0,4.0,0.0,4110.0,45080.0,50.0,3.0,1100000.0
679,3.0,2.0,1.0,2294.0,9108.0,8.0,0.0,359990.0
680,4.0,2.0,1.0,2507.0,6717.0,38.0,2.0,424900.0
681,5.0,3.0,1.0,3715.0,16775.0,46.0,2.0,626000.0


In [90]:
# Standardize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df2[['beds', 'full_baths', 'half_baths','sqft','lot_sqft','hoa_fee','parking_garage','list_price']])
# Apply PCA
pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
principal_components = pca.fit_transform(df_scaled)


In [91]:
# Create a DataFrame with the principal components
df_pca = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

In [92]:
print(df_pca.head())
print('Explained Variance Ratio:', pca.explained_variance_ratio_)
print('Singular Values:', pca.singular_values_)

        PC1       PC2
0 -1.089329 -0.164908
1 -3.177582  0.715522
2 -0.451122 -0.058154
3 -1.319373  0.930881
4  0.494890  0.297756
Explained Variance Ratio: [0.52315728 0.13684321]
Singular Values: [53.46523501 27.3443106 ]
