In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer,RobustScaler,MinMaxScaler


import warnings
warnings.filterwarnings("ignore")

import acquire as a
import prepare as p

In [2]:
df = a.get_zillow_data()

In [3]:
df.columns

Index(['parcelid', 'logerror', 'transactiondate', 'id', 'parcelid',
       'airconditioningtypeid', 'architecturalstyletypeid', 'basementsqft',
       'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fireplacecnt',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypei

In [4]:
df = df[['bathroomcnt', 'calculatedfinishedsquarefeet', 'taxvaluedollarcnt']]

In [5]:
df.head()

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,3.0,2376.0,145143.0
1,3.0,2962.0,773303.0
2,1.0,738.0,218552.0
3,3.0,3039.0,220583.0
4,3.0,2540.0,430108.0


In [6]:
df = df.dropna()

In [7]:
df.isna().sum()

bathroomcnt                     0
calculatedfinishedsquarefeet    0
taxvaluedollarcnt               0
dtype: int64

In [8]:
df.head()

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,3.0,2376.0,145143.0
1,3.0,2962.0,773303.0
2,1.0,738.0,218552.0
3,3.0,3039.0,220583.0
4,3.0,2540.0,430108.0


In [9]:
train, test = p.split_my_data(df, train_ratio = .80, seed = 123)

# Unscaled

In [21]:
# split into X and y
X_train = train[['bathroomcnt', 'calculatedfinishedsquarefeet']]
y_train = train[['taxvaluedollarcnt']]

X_test = test[['bathroomcnt', 'calculatedfinishedsquarefeet']]
y_test = test[['taxvaluedollarcnt']]

# get baseline value for X and y
mean_tax_value = int(y_train['taxvaluedollarcnt'].mean(axis=0))
y_test[['yhat']] = mean_tax_value

baseline_rmse = mean_squared_error(y_test.taxvaluedollarcnt, y_test.yhat)**1/2
print(f"Baseline RMSE: {baseline_rmse}\n")

# create/fit regression model on unscaled data
lr = LinearRegression(normalize=True)

lr.fit(X_train,y_train.taxvaluedollarcnt)

# get predictions
y_test['predictions'] = lr.predict(X_test)

# get error and coefficients for unscaled data
rmse_test = mean_squared_error(y_test.taxvaluedollarcnt, y_test.predictions)**1/2
coef = lr.coef_

print(f"RMSE Unscaled: {rmse_test}")
print(f"Coefficient Array Unscaled: {coef}\n")

Baseline RMSE: 307308915996.9734
RMSE Unscaled: 206959322106.87646
Coefficient Array Unscaled: [290.71254163 473.91435609]


# Scaled

In [22]:
def scaler_min_max(df, col_list):
    df_2 = df[col_list]
    df = df.drop(columns = col_list)
    scaler = MinMaxScaler(copy=True, feature_range=(0,1)).fit(df_2)
    df_2 = pd.DataFrame(scaler.transform(df_2), columns=df_2.columns.values).set_index([df_2.index.values])
    df = df.join(df_2)
    return df

In [23]:
# scale X data
X_train_scaled = scaler_min_max(X_train, ['bathroomcnt', 'calculatedfinishedsquarefeet'])
X_test_scaled = scaler_min_max(X_test, ['bathroomcnt', 'calculatedfinishedsquarefeet'])

# create and fit object to scaled data
lr2 = LinearRegression(normalize=True)
lr2.fit(X_train_scaled, y_train.taxvaluedollarcnt)

# get predictions
y_test['predictions_scaled'] = lr2.predict(X_test_scaled)

# get error and coefficients for scaled data
rmse_test = mean_squared_error(y_test.taxvaluedollarcnt, y_test.predictions_scaled)**1/2

coef = lr.coef_

print(f"RMSE scaled: {rmse2_test}")
print(f"Coefficient Array scaled: {coef2}\n")

RMSE scaled: 476820257762.3919
Coefficient Array scaled: [3.48855050e+03 1.01536151e+07]

