In [313]:
import pandas as pd
import numpy as np

# Data Cleaning

In [467]:
houses = pd.read_csv("https://raw.githubusercontent.com/Johnle3/MLHousingPrices/main/redfin_2022-06-06-11-05-25.csv")
# Remove unnecessary columns
houses = houses.drop('FAVORITE', axis=1)
houses = houses.drop(['SALE TYPE', 'SOLD DATE','ADDRESS','HOA/MONTH', 'NEXT OPEN HOUSE START TIME','NEXT OPEN HOUSE END TIME','INTERESTED','URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)','SOURCE', 'MLS#','STATUS','LOCATION','DAYS ON MARKET'], axis = 1)

#Subset to only Alexandria and without Vacant Lot
houses = houses[houses['CITY']!= 'Arlington']
houses = houses[houses['PROPERTY TYPE'] != "Vacant Land"]
houses = houses[houses['PROPERTY TYPE'] != "Condo/Co-op"]
houses = houses[houses['PROPERTY TYPE'] != "Multi-Family (2-4 Unit)"]


# EDA

In [468]:
# null values
houses.isna().sum()

PROPERTY TYPE          0
CITY                   0
STATE OR PROVINCE      0
ZIP OR POSTAL CODE     0
PRICE                  0
BEDS                   0
BATHS                  0
SQUARE FEET            0
LOT SIZE              21
YEAR BUILT             5
$/SQUARE FEET          0
LATITUDE               0
LONGITUDE              0
dtype: int64

In [469]:
# create buckets for year built
i = len(houses.index)
j = 0
while j < i:
    year = houses.iloc[j, 9]
    
    if 1700 <= year < 1900:
        houses.iloc[j, 9] = "18th-19th Century"
    elif 1900 <= year < 1950:
        houses.iloc[j, 9] = '1900-1950'
    elif 1950 <= year < 2000:
        houses.iloc[j, 9] = '1950-2000'
    elif 2000 <= year <= 2023:
        houses.iloc[j, 9] = '2000-2023'
    else:
        pass
    j = j + 1

In [470]:
# Drop null values
houses = houses.dropna()
houses

Unnamed: 0,PROPERTY TYPE,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,$/SQUARE FEET,LATITUDE,LONGITUDE
0,Single Family Residential,Alexandria,VA,22308,45000000,7.0,10.0,25500.0,435600.0,2000-2023,1765.0,38.739577,-77.045973
1,Single Family Residential,Alexandria,VA,22314,5400000,6.0,5.0,8145.0,21153.0,18th-19th Century,663.0,38.809834,-77.044992
4,Single Family Residential,Alexandria,VA,22309,3650000,4.0,5.0,5602.0,21972.0,2000-2023,652.0,38.699743,-77.116651
7,Single Family Residential,Alexandria,VA,22309,3250000,6.0,6.5,5590.0,105654.0,2000-2023,581.0,38.703304,-77.094285
8,Single Family Residential,Alexandria,VA,22314,3195000,4.0,3.5,4122.0,3116.0,18th-19th Century,775.0,38.803387,-77.044415
...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,Townhouse,Alexandria,VA,22301,825000,2.0,2.0,1383.0,2500.0,1900-1950,597.0,38.814822,-77.060530
343,Single Family Residential,Alexandria,VA,22310,825000,4.0,2.5,1952.0,8400.0,1950-2000,423.0,38.780294,-77.087991
345,Townhouse,Alexandria,VA,22315,825000,3.0,2.5,2172.0,2814.0,1950-2000,380.0,38.774994,-77.126456
347,Single Family Residential,Alexandria,VA,22315,824999,4.0,3.5,1842.0,7187.0,1950-2000,448.0,38.758442,-77.156358


In [471]:
houses['PROPERTY TYPE'].value_counts() # --> dummy vars

Single Family Residential    184
Townhouse                     76
Name: PROPERTY TYPE, dtype: int64

# Lasso (Shannon)

In [472]:
# define variables
y = houses['PRICE']
X = houses[['PROPERTY TYPE', 'BEDS','BATHS','SQUARE FEET','LOT SIZE','YEAR BUILT']]           

In [473]:
from sklearn.model_selection import train_test_split # outputs 4 diff objects

# Split data into train group and test group
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state =18) #random state not needed
X_columns = X_train.columns

In [474]:
len(y_test)

52

In [475]:
y_train.head(3)

126    1295000
82     1498500
267     900000
Name: PRICE, dtype: int64

In [476]:
# Encode categoricals
from sklearn.compose import make_column_transformer #allows tranformaation of columns based on given functions
from sklearn.preprocessing import OneHotEncoder #encoding into dummys

column_trans = make_column_transformer((OneHotEncoder(), [0, 5]), remainder='passthrough')



In [498]:
# Now all numerical data
X_train = column_trans.fit_transform(X_train) #fit
X_test = column_trans.transform(X_test)
print(y_test)

275      899900
226      975000
221      980000
164     1165000
31      1975000
278      899900
271      899950
204      999999
15      2575000
148     1199000
195     1020000
285      899000
91      1450000
158     1175000
246      950000
167     1150000
93      1450000
55      1645000
68      1595000
303      875000
210      999000
348      824999
64      1599000
65      1595000
219      985000
175     1100000
284      899000
276      899900
287      899000
44      1700000
135     1250000
176     1100000
119     1300000
265      924000
159     1175000
262      925000
0      45000000
137     1249000
81      1499000
215      995000
294      895000
28      1995000
178     1099900
112     1350000
161     1175000
177     1100000
7       3250000
238      959000
10      2995000
85      1495000
305      875000
280      899900
Name: PRICE, dtype: int64


In [478]:
# Scaling values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #create scaler object

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_test[:1] #first row of array

#now all standardized values

array([[ 0.68175416, -0.68175416, -0.27878344, -0.52542708,  1.04929028,
        -0.55513611, -0.90652299, -0.83857917, -0.91493715, -0.52372109]])

In [518]:
# Lasso Regression Model
from sklearn import linear_model
clf = linear_model.Lasso(alpha =0.5, max_iter = 2000) #What alpha do
clf.fit(X_train, y_train)


Lasso(alpha=0.5, max_iter=2000)

In [519]:
# Coefficients of lasso model
print(clf.coef_)

[-2.03404610e+05  2.18063835e-09 -6.76974902e+05  0.00000000e+00
  1.08608640e+05  1.26811499e+04 -0.00000000e+00  1.17972292e+03
  7.61541445e+04 -3.13860696e+04 -1.47276342e+03  6.17738097e+04
  2.72472769e+05  5.56770765e+04]


In [520]:
# Intercept of lasso model
print(clf.intercept_)

1925457.1732511441


In [521]:
# predicted sale price based on lasso regression
y_predict = clf.predict(X_test)
y_predict

array([ 763694.76808702,  879979.3524322 , 1385496.38096179,
       1334410.95011881, 1404158.05937358,  777961.53442407,
       1097560.31702238, 1463926.01437566, 1430084.08467765,
       1519463.21012776, 1482289.26621147,  966090.31294316,
       1254999.99618471, 1063756.36224984, 1168108.85473689,
       1334410.95011881, 1397390.52100115, 1783908.11574838,
       1397988.72993675, 1080769.69891745,  854043.39185631,
       1078881.65322139, 1631649.63594666, 1468576.37076217,
        967456.98581335,  938754.18912466,  804494.3733087 ,
       1381351.92408377, 1187383.05645832, 1811751.9260096 ,
       1086133.39440174, 1001406.41818359, 1865515.61269441,
       1152231.74095432, 1694960.59450607,  881206.47260674,
       8150239.43405658, 1477925.76589906, 1654993.09967436,
       1325080.79422486, 1052959.08177228, 1377939.72264956,
       1256243.48370827, 1287909.40376375, 1265623.17677348,
        887407.44807057, 2295387.18130882, 1058038.61670076,
       1886091.75468668,

In [522]:
# true sale price
y_true = y_test

In [523]:
# r2
from sklearn.metrics import r2_score
r2_score(y_true, y_predict)

0.27747307563910306

In [525]:
# R2 at different alphas
moving_alpha = .05
while moving_alpha < 1:
    clf = linear_model.Lasso(alpha = moving_alpha, max_iter = 2000) #What alpha do
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    score = r2_score(y_true, y_predict)
    print("alpha value: " + str(moving_alpha) + " / r2 score: " + str(score))
    moving_alpha += .05

alpha value: 0.05 / r2 score: 0.2774748095003856
alpha value: 0.1 / r2 score: 0.27747461681140906
alpha value: 0.15000000000000002 / r2 score: 0.2774744245276344
alpha value: 0.2 / r2 score: 0.27747423121170756
alpha value: 0.25 / r2 score: 0.2774740395649252
alpha value: 0.3 / r2 score: 0.2774738462489301
alpha value: 0.35 / r2 score: 0.2774736555873917
alpha value: 0.39999999999999997 / r2 score: 0.27747346227132474
alpha value: 0.44999999999999996 / r2 score: 0.2774732689552286
alpha value: 0.49999999999999994 / r2 score: 0.27747307563910306
alpha value: 0.5499999999999999 / r2 score: 0.2774728823229472
alpha value: 0.6 / r2 score: 0.27747269315983103
alpha value: 0.65 / r2 score: 0.27747249984359845
alpha value: 0.7000000000000001 / r2 score: 0.27747230652733623
alpha value: 0.7500000000000001 / r2 score: 0.2774721132110449
alpha value: 0.8000000000000002 / r2 score: 0.27747191989472375
alpha value: 0.8500000000000002 / r2 score: 0.27747172657837293
alpha value: 0.9000000000000002 

# Lasso (John)

In [245]:
import sklearn
from sklearn import linear_model

In [246]:
model = linear_model.Lasso(alpha=1.0)

In [247]:
from numpy import arange
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [248]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [249]:
X,y= houses[['BEDS','BATHS','SQUARE FEET', 'LOT SIZE','YEAR BUILT']], houses[['PRICE']]
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean MAE: nan (nan)


Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py", line 771, in fit
    X, y = self._validate_data(X, y, accept_sparse='csc',
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Ap

  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 673, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
  File "/Applications/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 1993, in __array__
    return np.asarray(self._values, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: '2000-2023'

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_a

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py", line 771, in fit
    X, y = self._validate_data(X, y, accept_sparse='csc',
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Ap

In [242]:
grid = dict()
grid['alpha'] = arange(0, 1, 0.01)

rn f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 673, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
  File "/Applications/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 1993, in __array__
    return np.asarray(self._values, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: '2000-2023'

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages

In [243]:
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

In [244]:
from pandas import read_csv
from sklearn.linear_model import Lasso
# load the dataset
# define model
model = Lasso(alpha=1.0)
# fit model
model.fit(X, y)
# define new data
row = [2,2,10000,2000,2000]
# make a prediction
yhat = model.predict([row])
# summarize prediction
print('Predicted: %.3f' % yhat)

ValueError: could not convert string to float: '2000-2023'

In [None]:
row = [2,2,10000,2000,2020]
# make a prediction
yhat = model.predict([row])
print('Predicted: %.3f' % yhat)