In [534]:
import pandas as pd
import numpy as np

# Data Cleaning

In [535]:
houses = pd.read_csv("https://raw.githubusercontent.com/Johnle3/MLHousingPrices/main/redfin_2022-06-06-11-05-25.csv")
# Remove unnecessary columns
houses = houses.drop('FAVORITE', axis=1)
houses = houses.drop(['SALE TYPE', 'SOLD DATE','ADDRESS','HOA/MONTH', 'NEXT OPEN HOUSE START TIME','NEXT OPEN HOUSE END TIME','INTERESTED','URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)','SOURCE', 'MLS#','STATUS','LOCATION','DAYS ON MARKET'], axis = 1)

#Subset to only Alexandria and without Vacant Lot
houses = houses[houses['CITY']!= 'Arlington']
houses = houses[houses['PROPERTY TYPE'] != "Vacant Land"]
houses = houses[houses['PROPERTY TYPE'] != "Condo/Co-op"]
houses = houses[houses['PROPERTY TYPE'] != "Multi-Family (2-4 Unit)"]


# EDA

In [536]:
# null values
houses.isna().sum()

PROPERTY TYPE          0
CITY                   0
STATE OR PROVINCE      0
ZIP OR POSTAL CODE     0
PRICE                  0
BEDS                   0
BATHS                  0
SQUARE FEET            0
LOT SIZE              21
YEAR BUILT             5
$/SQUARE FEET          0
LATITUDE               0
LONGITUDE              0
dtype: int64

In [537]:
# create buckets for year built
i = len(houses.index)
j = 0
while j < i:
    year = houses.iloc[j, 9]
    
    if 1700 <= year < 1900:
        houses.iloc[j, 9] = "18th-19th Century"
    elif 1900 <= year < 1950:
        houses.iloc[j, 9] = '1900-1950'
    elif 1950 <= year < 2000:
        houses.iloc[j, 9] = '1950-2000'
    elif 2000 <= year <= 2023:
        houses.iloc[j, 9] = '2000-2023'
    else:
        pass
    j = j + 1

In [538]:
# Drop null values
houses = houses.dropna()
houses

Unnamed: 0,PROPERTY TYPE,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,$/SQUARE FEET,LATITUDE,LONGITUDE
0,Single Family Residential,Alexandria,VA,22308,45000000,7.0,10.0,25500.0,435600.0,2000-2023,1765.0,38.739577,-77.045973
1,Single Family Residential,Alexandria,VA,22314,5400000,6.0,5.0,8145.0,21153.0,18th-19th Century,663.0,38.809834,-77.044992
4,Single Family Residential,Alexandria,VA,22309,3650000,4.0,5.0,5602.0,21972.0,2000-2023,652.0,38.699743,-77.116651
7,Single Family Residential,Alexandria,VA,22309,3250000,6.0,6.5,5590.0,105654.0,2000-2023,581.0,38.703304,-77.094285
8,Single Family Residential,Alexandria,VA,22314,3195000,4.0,3.5,4122.0,3116.0,18th-19th Century,775.0,38.803387,-77.044415
...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,Townhouse,Alexandria,VA,22301,825000,2.0,2.0,1383.0,2500.0,1900-1950,597.0,38.814822,-77.060530
343,Single Family Residential,Alexandria,VA,22310,825000,4.0,2.5,1952.0,8400.0,1950-2000,423.0,38.780294,-77.087991
345,Townhouse,Alexandria,VA,22315,825000,3.0,2.5,2172.0,2814.0,1950-2000,380.0,38.774994,-77.126456
347,Single Family Residential,Alexandria,VA,22315,824999,4.0,3.5,1842.0,7187.0,1950-2000,448.0,38.758442,-77.156358


In [539]:
houses['PROPERTY TYPE'].value_counts() # --> dummy vars

Single Family Residential    184
Townhouse                     76
Name: PROPERTY TYPE, dtype: int64

In [673]:
houses['ZIP CODE OR POSTAL CODE']

KeyError: 'ZIP CODE OR POSTAL CODE'

# Regression (Shannon)

In [661]:
# define variables
y = houses['PRICE']
X = houses[['PROPERTY TYPE', 'BEDS','BATHS','SQUARE FEET','LOT SIZE','YEAR BUILT']]           

In [662]:
from sklearn.model_selection import train_test_split # outputs 4 diff objects

# Split data into train group and test group
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state =18) #random state not needed
X_columns = X_train.columns

In [663]:
len(y_test) #rows in testing subset

52

In [664]:
y_train.head(3)

126    1295000
82     1498500
267     900000
Name: PRICE, dtype: int64

In [666]:
# Encode categoricals
from sklearn.compose import make_column_transformer #allows tranformaation of columns based on given functions
from sklearn.preprocessing import OneHotEncoder #encoding into dummys

column_trans = make_column_transformer((OneHotEncoder(), [0,5]), remainder='passthrough')

# Now all numerical data
X_train = column_trans.fit_transform(X_train) #fit
X_test = column_trans.transform(X_test)

In [667]:
# Scaling values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #create scaler object

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_test[:1] #first row of array

#now all standardized values

array([[ 0.68175416, -0.68175416, -0.90652299, -0.83857917, -0.91493715,
        -0.52372109]])

## Regression function

In [668]:
from sklearn import linear_model
from sklearn import svm

In [669]:
def ml_function(model): #add alpha and l1 ratio
    if model == "lasso":
        clf = linear_model.Lasso(alpha = 0.5, max_iter = 10000) #What alpha do
    elif model == "ridge":
        clf = linear_model.Ridge(alpha = 0.5, max_iter = 10000) 
    elif model == "elastic_net":
        clf = linear_model.ElasticNet(alpha = 0.2, l1_ratio = 0.9, max_iter = 10000)
    elif model == "linear_regression":
        clf = linear_model.LinearRegression() #no alpha value.... must fix... default?
    elif model == "svr_linear":
        clf = svm.SVR(kernel = 'linear')
    clf.fit(X_train, y_train)
    print(model + " " + "Coefficients:   " + str(clf.coef_) + "\n")
    print(model + " " + "Intercept:   " + str(clf.intercept_) + "\n")
    y_predict = clf.predict(X_test)
    y_true = y_test
    print(model + " " + "R2: " + str(r2_score(y_true, y_predict)))


In [671]:
ml_function("lasso")

lasso Coefficients:   [-105840.9242983        0.           -7689.66579034     703.47123305
  330229.37957458   44853.58327401]

lasso Intercept:   1242279.860576923

lasso R2: 0.28489524959842005


## Lasso (Shannon, OG)

In [557]:
# Lasso Regression Model
from sklearn import linear_model
clf_lasso = linear_model.Lasso(alpha =0.5, max_iter = 100000) #What alpha do
clf_lasso.fit(X_train, y_train)


Lasso(alpha=0.5, max_iter=100000)

In [558]:
# Coefficients of lasso model
print(clf_lasso.coef_)

[-45300.11722706      0.         178401.09807099  81376.33486057
 -25052.00914411     -0.          -1472.8298891   61774.54463375
 272472.40139969  55677.14829941]


In [559]:
# Intercept of lasso model
print(clf_lasso.intercept_)

1242279.860576923


In [560]:
# predicted sale price based on lasso regression
y_predict = clf_lasso.predict(X_test)
y_predict

array([ 763694.77386153,  879979.21713357, 1385495.11000377,
       1334410.12794907, 1404158.78332785,  777960.9221951 ,
       1097559.79656619, 1463929.87325758, 1430084.23098219,
       1519462.14938308, 1482293.49437385,  966090.4777699 ,
       1254999.91781285, 1063756.37592057, 1168109.50462132,
       1334410.12794907, 1397390.47849471, 1783907.28136851,
       1397989.13014619, 1080769.36151399,  854042.83218604,
       1078879.84927599, 1631649.59717081, 1468576.47793594,
        967455.73173795,  938754.37289135,  804494.67004484,
       1381351.11502052, 1187383.06766003, 1811756.73396111,
       1086133.79184603, 1001406.69677444, 1865516.06117235,
       1152232.15985788, 1694961.4060609 ,  881207.24916901,
       8150238.97613612, 1477925.54533021, 1654997.37676866,
       1325080.78443656, 1052959.45188972, 1377940.71344658,
       1256243.56294537, 1287909.15536113, 1265622.94301493,
        887406.5992028 , 2295387.82728057, 1058039.0015556 ,
       1886093.81807087,

In [561]:
# true sale price
y_true = y_test

In [562]:
# r2
from sklearn.metrics import r2_score
r2_score(y_true, y_predict)

0.27747305782137244

In [564]:
# R2 at different alphas
moving_alpha = .05
while moving_alpha < 1:
    clf_lasso = linear_model.Lasso(alpha = moving_alpha, max_iter = 100000) #What alpha do
    clf_lasso.fit(X_train, y_train)
    y_predict = clf_lasso.predict(X_test)
    score = r2_score(y_true, y_predict)
    print("alpha value: " + str(moving_alpha) + " / r2 score: " + str(score))
    moving_alpha += .05

  model = cd_fast.enet_coordinate_descent(


alpha value: 0.05 / r2 score: 0.27747482823443914
alpha value: 0.1 / r2 score: 0.2774746134797976
alpha value: 0.15000000000000002 / r2 score: 0.2774744201438155
alpha value: 0.2 / r2 score: 0.2774742245653604
alpha value: 0.25 / r2 score: 0.27747403010809946
alpha value: 0.3 / r2 score: 0.277473836541287
alpha value: 0.35 / r2 score: 0.27747364260803054
alpha value: 0.39999999999999997 / r2 score: 0.27747344673614704
alpha value: 0.44999999999999996 / r2 score: 0.27747325474142204
alpha value: 0.49999999999999994 / r2 score: 0.27747305782137244
alpha value: 0.5499999999999999 / r2 score: 0.277472866273733
alpha value: 0.6 / r2 score: 0.27747267294129985
alpha value: 0.65 / r2 score: 0.27747247900787364
alpha value: 0.7000000000000001 / r2 score: 0.2774722850744138
alpha value: 0.7500000000000001 / r2 score: 0.27747208773887033
alpha value: 0.8000000000000002 / r2 score: 0.27747189961130425
alpha value: 0.8500000000000002 / r2 score: 0.2774716988237066
alpha value: 0.9000000000000002 /

# Lasso (John)

In [245]:
import sklearn
from sklearn import linear_model

In [246]:
model = linear_model.Lasso(alpha=1.0)

In [247]:
from numpy import arange
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [248]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [249]:
X,y= houses[['BEDS','BATHS','SQUARE FEET', 'LOT SIZE','YEAR BUILT']], houses[['PRICE']]
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean MAE: nan (nan)


Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py", line 771, in fit
    X, y = self._validate_data(X, y, accept_sparse='csc',
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Ap

  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 673, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
  File "/Applications/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 1993, in __array__
    return np.asarray(self._values, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: '2000-2023'

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_a

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py", line 771, in fit
    X, y = self._validate_data(X, y, accept_sparse='csc',
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Ap

In [242]:
grid = dict()
grid['alpha'] = arange(0, 1, 0.01)

rn f(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 673, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
  File "/Applications/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 1993, in __array__
    return np.asarray(self._values, dtype=dtype)
  File "/Applications/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: '2000-2023'

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages

In [243]:
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

In [244]:
from pandas import read_csv
from sklearn.linear_model import Lasso
# load the dataset
# define model
model = Lasso(alpha=1.0)
# fit model
model.fit(X, y)
# define new data
row = [2,2,10000,2000,2000]
# make a prediction
yhat = model.predict([row])
# summarize prediction
print('Predicted: %.3f' % yhat)

ValueError: could not convert string to float: '2000-2023'

In [None]:
row = [2,2,10000,2000,2020]
# make a prediction
yhat = model.predict([row])
print('Predicted: %.3f' % yhat)