In [34]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer,RobustScaler,MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

import acquire as a
import prepare as p

In [2]:
df = a.get_zillow_data()

In [3]:
df.columns

Index(['parcelid', 'logerror', 'transactiondate', 'id', 'parcelid',
       'airconditioningtypeid', 'architecturalstyletypeid', 'basementsqft',
       'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fireplacecnt',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypei

In [4]:
df = df[['bathroomcnt', 'bedroomcnt', 'lotsizesquarefeet', 'taxvaluedollarcnt']]

In [5]:
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,lotsizesquarefeet,taxvaluedollarcnt
0,3.0,4.0,13038.0,145143.0
1,3.0,4.0,63000.0,773303.0
2,1.0,2.0,4214.0,218552.0
3,3.0,4.0,20028.0,220583.0
4,3.0,4.0,10384.0,430108.0


In [6]:
df = df.dropna()

In [7]:
df.isna().sum()

bathroomcnt          0
bedroomcnt           0
lotsizesquarefeet    0
taxvaluedollarcnt    0
dtype: int64

In [19]:
train, test = p.split_my_data(df, train_ratio = .80, seed = 123)

# Unscaled

In [20]:
# split into X and y
X_train = train[['bathroomcnt', 'bedroomcnt', 'lotsizesquarefeet']]
y_train = train[['taxvaluedollarcnt']]

X_test = test[['bathroomcnt', 'bedroomcnt', 'lotsizesquarefeet']]
y_test = test[['taxvaluedollarcnt']]


mean_tax_value = int(y_train['taxvaluedollarcnt'].mean(axis=0))
y_test[['yhat']] = mean_tax_value

In [38]:
y_test

Unnamed: 0,taxvaluedollarcnt,yhat
33022,403691.0,494688
33690,97499.0,494688
33226,751284.0,494688
12740,252745.0,494688
25728,49031.0,494688
...,...,...
2691,190866.0,494688
26566,83212.0,494688
31772,646000.0,494688
22648,693683.0,494688


In [37]:
baseline_rmse_test = mean_squared_error(y_test.taxvaluedollarcnt, y_test.yhat)**1/2
evs = explained_variance_score(y_test.taxvaluedollarcnt, y_test.yhat)

evs

0.0

In [22]:
lr = LinearRegression(normalize=True)

lr.fit(X_train,y_train.taxvaluedollarcnt)

y_test['predictions'] = lr.predict(X_test)

rmse_test = mean_squared_error(y_test.taxvaluedollarcnt, y_test.predictions)**1/2

rmse_test

264531634146.2733

In [23]:
rmse_test < baseline_rmse_test

True

# Scaled

In [13]:
train, test = p.split_my_data(df, train_ratio = .80, seed = 123)

In [30]:
# split into X and y
X_train = train[['bathroomcnt', 'bedroomcnt', 'lotsizesquarefeet']]
y_train = train[['taxvaluedollarcnt']]

X_test = test[['bathroomcnt', 'bedroomcnt', 'lotsizesquarefeet']]
y_test = test[['taxvaluedollarcnt']]

# scale data
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = pd.DataFrame(scaler.transform(X_train))
X_test_scaled = scaler.transform(X_test)

# get baseline
mean_tax_value = int(y_train['taxvaluedollarcnt'].mean(axis=0))
y_test[['yhat']] = mean_tax_value

In [27]:
X_test.head()

Unnamed: 0,bathroomcnt,bedroomcnt,lotsizesquarefeet
33022,2.0,3.0,7754.0
33690,2.0,2.0,12000.0
33226,3.0,4.0,5188.0
12740,2.0,3.0,7544.0
25728,2.0,3.0,8822.0


In [32]:
X_test_scaled

array([[0.08333333, 0.2       , 0.00097818],
       [0.08333333, 0.1       , 0.00158736],
       [0.16666667, 0.3       , 0.00061004],
       ...,
       [0.08333333, 0.2       , 0.00071118],
       [0.08333333, 0.2       , 0.00091721],
       [0.16666667, 0.3       , 0.00316295]])

In [25]:
baseline_rmse_test = mean_squared_error(y_test.taxvaluedollarcnt, y_test.yhat)**1/2
baseline_rmse_test

353833722184.4765

In [26]:
lr = LinearRegression(normalize=True)

lr.fit(X_train_scaled,y_train.taxvaluedollarcnt)

y_test['predictions_scaled'] = lr.predict(X_test_scaled)

rmse_test_ = mean_squared_error(y_test.taxvaluedollarcnt, y_test.predictions_scaled)**1/2

rmse_test

264531634146.2733

In [17]:
rmse_test < baseline_rmse_test

True

In [18]:
y_test

Unnamed: 0,taxvaluedollarcnt,yhat,predictions,predictions_scaled
33022,403691.0,494688,432551.266043,432551.266043
33690,97499.0,494688,520080.951531,520080.951531
33226,751284.0,494688,768294.307521,768294.307521
12740,252745.0,494688,432545.976015,432545.976015
25728,49031.0,494688,432578.169612,432578.169612
...,...,...,...,...
2691,190866.0,494688,432469.195043,432469.195043
26566,83212.0,494688,519930.336888,519930.336888
31772,646000.0,494688,432504.386322,432504.386322
22648,693683.0,494688,432540.560034,432540.560034
