In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pydataset
from sklearn.feature_selection import f_regression, SelectKBest, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from env import get_db_url

import warnings
warnings.filterwarnings('ignore')

In [37]:
url = get_db_url('zillow')
query = '''
            
SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, fips, transactiondate, regionidzip
FROM properties_2017

LEFT JOIN propertylandusetype USING(propertylandusetypeid)
JOIN predictions_2017 USING (parcelid)

WHERE propertylandusedesc IN ("Single Family Residential",                       
                              "Inferred Single Family Residential")'''


data = pd.read_sql(query, url)
df = data.copy() #use copy to explore on
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,transactiondate,regionidzip
0,4.0,3.5,3100.0,1023282.0,1998.0,11013.72,6059.0,2017-01-01,96978.0
1,2.0,1.0,1465.0,464000.0,1967.0,5672.48,6111.0,2017-01-01,97099.0
2,3.0,2.0,1243.0,564778.0,1962.0,6488.3,6059.0,2017-01-01,97078.0
3,4.0,3.0,2376.0,145143.0,1970.0,1777.51,6037.0,2017-01-01,96330.0
4,4.0,3.0,2962.0,773303.0,1950.0,9516.26,6037.0,2017-01-01,96293.0


In [38]:
#df.regionidzip.to_list()

In [39]:
df.isna().sum()

bedroomcnt                        0
bathroomcnt                       0
calculatedfinishedsquarefeet     82
taxvaluedollarcnt                 1
yearbuilt                       116
taxamount                         4
fips                              0
transactiondate                   0
regionidzip                      26
dtype: int64

In [40]:
#what is the percentage that would be left if null values dropped?

round(df.dropna().shape[0] / df.shape[0], 4)

0.9972

In [41]:
 #rename df columns
df = df.rename(columns = {'bedroomcnt': 'bedrooms',
                              'bathroomcnt': 'bathrooms',
                              'calculatedfinishedsquarefeet': 'area',
                              'taxvaluedollarcnt': 'tax_value',
                              'yearbuilt': 'year_built',
                              'taxamount': 'tax_amount',
                              'fips': 'county',
                              'transactiondate': 'transaction_date',
                              'regionidzip': 'zip_code'})

In [42]:
#prepwork

#change fips to categorical using map to show county info:

df['county'] = df.county.map({6037.0: 'LA', 6059.0: 'OC', 6111.0: 'VC'})#.astype(str)


#change year_built and zip_code to categorical:

#get rid of zeros
df.year_built = df.round().year_built.astype('Int64')
df.zip_code = df.round().zip_code.astype('Int64')

#change to object

df.year_built = df.year_built.astype(object)
df.zip_code = df.zip_code.astype(object)

df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,tax_amount,county,transaction_date,zip_code
0,4.0,3.5,3100.0,1023282.0,1998,11013.72,OC,2017-01-01,96978
1,2.0,1.0,1465.0,464000.0,1967,5672.48,VC,2017-01-01,97099
2,3.0,2.0,1243.0,564778.0,1962,6488.3,OC,2017-01-01,97078
3,4.0,3.0,2376.0,145143.0,1970,1777.51,LA,2017-01-01,96330
4,4.0,3.0,2962.0,773303.0,1950,9516.26,LA,2017-01-01,96293


In [43]:

def discard_outliers(df, k, col_list):
    
    for col in col_list:
        #obtain quartiles
        q1, q3 = df[col].quantile([.25, .75]) 
        
        #obtain iqr range
        iqr = q3 - q1
        
        upper_bound = q3 + k * iqr
        lower_bound = q1 - k * iqr
        
        #return outlier - free df
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]        
    return df

In [44]:
df.isna().sum()

bedrooms              0
bathrooms             0
area                 82
tax_value             1
year_built          116
tax_amount            4
county                0
transaction_date      0
zip_code             26
dtype: int64

In [45]:
df.zip_code.nunique()

381

In [46]:
df.describe()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,tax_amount
count,52442.0,52442.0,52360.0,52441.0,52438.0
mean,3.300675,2.299397,1922.874083,529683.0,6453.451945
std,0.949086,1.022764,1004.363491,751888.3,8755.495483
min,0.0,0.0,128.0,1000.0,49.18
25%,3.0,2.0,1268.0,193750.0,2656.25
50%,3.0,2.0,1659.0,373612.0,4647.73
75%,4.0,3.0,2306.0,619301.0,7377.445
max,14.0,18.0,21929.0,49061240.0,586639.3


In [47]:
#use function get rid of outliers
df = discard_outliers(df, 1.5, ['bedrooms', 'bathrooms', 'area', 'tax_value', 'tax_amount'])

In [48]:
df.describe()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,tax_amount
count,44885.0,44885.0,44885.0,44885.0,44885.0
mean,3.207976,2.094898,1694.222569,363597.7,4505.706995
std,0.788015,0.723452,605.801071,233628.1,2637.418915
min,2.0,1.0,152.0,1000.0,49.18
25%,3.0,2.0,1238.0,171482.0,2413.76
50%,3.0,2.0,1568.0,331369.0,4182.74
75%,4.0,2.5,2051.0,511511.0,6165.74
max,5.0,4.0,3569.0,1122409.0,12060.53


In [49]:
df.isna().sum()

bedrooms             0
bathrooms            0
area                 0
tax_value            0
year_built          20
tax_amount           0
county               0
transaction_date     0
zip_code            12
dtype: int64

In [50]:
df.shape

(44885, 9)

In [51]:
#eliminate values that did not occur in 2017
df = df[(df.transaction_date <= '2017-12-31')]

In [52]:
#can impute mode or drop missing values

df.year_built.mode()

0    1955
dtype: object

In [53]:
#can impute mode or drop missing values
df.zip_code.mode()

0    97319
dtype: object

In [54]:
df.zip_code.nunique()

377

In [55]:
df = df.dropna()

In [57]:
df.shape

(44853, 9)

### Train, Test, Split

In [58]:
train_validate, test = train_test_split(df, test_size = .2, random_state = 123)
train, validate = train_test_split(train_validate, test_size = .3, random_state = 123)
print(train.shape, validate.shape, test.shape)

(25117, 9) (10765, 9) (8971, 9)


In [59]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

In [60]:
#create list of columns to scale
columns_to_scale = ['bedrooms', 'bathrooms', 'area', 'tax_value']

#create variable and set to train copy so original data remains untouched
train_scaled = train.copy()

#create/choose scaler
scaler = MinMaxScaler()

#fit_transform scaler to train data 
train_scaled[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])

train_scaled.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,tax_amount,county,transaction_date,zip_code
45708,0.333333,0.333333,0.233246,0.298731,1994,3955.65,LA,2017-08-17,96216
48451,0.333333,0.333333,0.261048,0.322674,1958,4401.95,LA,2017-08-30,96488
49291,0.333333,0.333333,0.614867,0.310809,1963,3976.98,OC,2017-09-01,97008
15753,0.333333,0.0,0.267486,0.194833,1953,2869.64,OC,2017-04-04,97050
24257,0.666667,0.666667,0.62511,0.468161,1950,6704.14,LA,2017-05-15,96193


In [70]:
X_train_scaled = train_scaled.drop('tax_value', axis = 1)
y_train = train['tax_value']

In [80]:
#drop non-numeric columns or encode them for feature engineering

X_train_scaled = X_train_scaled.drop(['year_built', 'county', 'transaction_date','zip_code'], axis = 1)

KeyError: "['year_built' 'county' 'transaction_date' 'zip_code'] not found in axis"

In [76]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, give me 3 features
f_selector = SelectKBest(f_regression, k = 3)

# find the top 3 X's correlated with y
f_selector.fit(X_train_scaled, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()


In [77]:
f_feature

['bathrooms', 'area', 'tax_amount']

In [78]:
#from sklearn.linear_model import LinearRegression
#from sklearn.feature_selection import RFE

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select = 3)

# fit the data using RFE
rfe.fit(X_train_scaled , y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

In [79]:
rfe_feature

['bedrooms', 'bathrooms', 'area']