In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

import acquire
import prepare
import env

from wrangle_zillow import wrangle_zillow_data

## acquire

In [2]:
# Select property attributes from properties_2017 together with the logerror
# and transaction date from predictions_2017, matched on parcelid.
# The WHERE clause keeps only propertylandusetypeid = 261
# (presumably single-family residential — TODO confirm against the data dictionary).
query ='''
select 
    prop.parcelid
    , pred.logerror
    , pred.transactiondate
    , bathroomcnt
    , bedroomcnt
    , calculatedfinishedsquarefeet
    , fips
    , latitude
    , longitude
    , lotsizesquarefeet
    , regionidcity
    , regionidcounty
    , regionidneighborhood
    , regionidzip
    , yearbuilt
    , structuretaxvaluedollarcnt
    , taxvaluedollarcnt
    , landtaxvaluedollarcnt
    , taxamount
from properties_2017 prop
inner join predictions_2017 pred on prop.parcelid = pred.parcelid
where propertylandusetypeid = 261;
'''

# Run the query; the connection URL for the 'zillow' database comes from the
# local env module (env.get_url).
df = pd.read_sql(query, env.get_url('zillow'))

In [3]:
#df = pd.read_csv('zillow_data.csv')

In [4]:
#df = df.drop(columns=['Unnamed: 0'])

### Goal: Improve our original estimate of the log error by using clustering methodologies.

## Acquisition, Prep, and Initial Exploration
Using the notebook and files you created during the exercises make any changes, additions, etc. you want at this point. NOTE: You will NOT be splitting into train and test at this point.

Ideas:

   1. Data types:

        - Write a function that takes in a dataframe and a list of column names and returns the dataframe with the datatypes of those columns changed to a non-numeric type.

In [5]:
df.head()

Unnamed: 0,parcelid,logerror,transactiondate,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,regionidcity,regionidcounty,regionidneighborhood,regionidzip,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount
0,14297519,0.025595,2017-01-01,3.5,4.0,3100.0,6059.0,33634931.0,-117869207.0,4506.0,53571.0,1286.0,,96978.0,1998.0,485713.0,1023282.0,537569.0,11013.72
1,17052889,0.055619,2017-01-01,1.0,2.0,1465.0,6111.0,34449266.0,-119281531.0,12647.0,13091.0,2061.0,,97099.0,1967.0,88000.0,464000.0,376000.0,5672.48
2,14186244,0.005383,2017-01-01,2.0,3.0,1243.0,6059.0,33886168.0,-117823170.0,8432.0,21412.0,1286.0,,97078.0,1962.0,85289.0,564778.0,479489.0,6488.3
3,12177905,-0.10341,2017-01-01,3.0,4.0,2376.0,6037.0,34245180.0,-118240722.0,13038.0,396551.0,3101.0,,96330.0,1970.0,108918.0,145143.0,36225.0,1777.51
4,12095076,-0.001011,2017-01-01,3.0,4.0,2962.0,6037.0,34145202.0,-118179824.0,63000.0,47019.0,3101.0,274684.0,96293.0,1950.0,276684.0,773303.0,496619.0,9516.26


In [6]:
# it works!
cols = ['bathroomcnt', 'latitude']
def change_data(df, cols):
    """
    Cast the listed columns of a dataframe to the generic ``object`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is left unmodified.
    cols : list
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame: the untouched columns come first in their original
        order, followed by the converted columns in the order of ``cols``.
    """
    untouched = df.drop(columns=cols)
    as_objects = pd.DataFrame(df, columns=cols).astype(object)
    return pd.concat([untouched, as_objects], axis=1)
    
        



#zillow = df.copy()
#zillow = zillow.astype(object)

   - Use this function to appropriately transform any numeric columns that should not be treated as numbers.

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52442 entries, 0 to 52441
Data columns (total 19 columns):
parcelid                        52442 non-null int64
logerror                        52442 non-null float64
transactiondate                 52442 non-null object
bathroomcnt                     52442 non-null float64
bedroomcnt                      52442 non-null float64
calculatedfinishedsquarefeet    52360 non-null float64
fips                            52442 non-null float64
latitude                        52442 non-null float64
longitude                       52442 non-null float64
lotsizesquarefeet               52073 non-null float64
regionidcity                    51405 non-null float64
regionidcounty                  52442 non-null float64
regionidneighborhood            19033 non-null float64
regionidzip                     52416 non-null float64
yearbuilt                       52326 non-null float64
structuretaxvaluedollarcnt      52358 non-null float64
taxvaluedollar

> Upon closer inspection, it looks like we can use this function to convert the identifier-like columns: zip codes, parcelid, fips, raw census tract/block, and the regionid columns.

In [8]:
df.head()

Unnamed: 0,parcelid,logerror,transactiondate,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,regionidcity,regionidcounty,regionidneighborhood,regionidzip,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount
0,14297519,0.025595,2017-01-01,3.5,4.0,3100.0,6059.0,33634931.0,-117869207.0,4506.0,53571.0,1286.0,,96978.0,1998.0,485713.0,1023282.0,537569.0,11013.72
1,17052889,0.055619,2017-01-01,1.0,2.0,1465.0,6111.0,34449266.0,-119281531.0,12647.0,13091.0,2061.0,,97099.0,1967.0,88000.0,464000.0,376000.0,5672.48
2,14186244,0.005383,2017-01-01,2.0,3.0,1243.0,6059.0,33886168.0,-117823170.0,8432.0,21412.0,1286.0,,97078.0,1962.0,85289.0,564778.0,479489.0,6488.3
3,12177905,-0.10341,2017-01-01,3.0,4.0,2376.0,6037.0,34245180.0,-118240722.0,13038.0,396551.0,3101.0,,96330.0,1970.0,108918.0,145143.0,36225.0,1777.51
4,12095076,-0.001011,2017-01-01,3.0,4.0,2962.0,6037.0,34145202.0,-118179824.0,63000.0,47019.0,3101.0,274684.0,96293.0,1950.0,276684.0,773303.0,496619.0,9516.26


In [9]:
# Identifier-like columns: stored as numbers, but they should not be treated as quantities.
cols = ['parcelid','fips', 'regionidcity', 'regionidcounty', 'regionidzip',
       'regionidneighborhood',]
# change_data returns a new frame, so `df` keeps its original dtypes.
zillow = change_data(df, cols)

In [10]:
zillow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52442 entries, 0 to 52441
Data columns (total 19 columns):
logerror                        52442 non-null float64
transactiondate                 52442 non-null object
bathroomcnt                     52442 non-null float64
bedroomcnt                      52442 non-null float64
calculatedfinishedsquarefeet    52360 non-null float64
latitude                        52442 non-null float64
longitude                       52442 non-null float64
lotsizesquarefeet               52073 non-null float64
yearbuilt                       52326 non-null float64
structuretaxvaluedollarcnt      52358 non-null float64
taxvaluedollarcnt               52441 non-null float64
landtaxvaluedollarcnt           52441 non-null float64
taxamount                       52438 non-null float64
parcelid                        52442 non-null object
fips                            52442 non-null object
regionidcity                    51405 non-null object
regionidcounty 

   2. Missing Values: Impute the values in land square feet.

In [11]:
zillow.sample()

Unnamed: 0,logerror,transactiondate,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,parcelid,fips,regionidcity,regionidcounty,regionidzip,regionidneighborhood
11706,0.038797,2017-03-15,2.0,4.0,1380.0,33753379.0,-117974131.0,7620.0,1955.0,118512.0,512000.0,393488.0,6103.0,13945692,6059,34780,1286,96990,


In [12]:
round(zillow.isna().sum()/len(zillow)*100,2)

logerror                         0.00
transactiondate                  0.00
bathroomcnt                      0.00
bedroomcnt                       0.00
calculatedfinishedsquarefeet     0.16
latitude                         0.00
longitude                        0.00
lotsizesquarefeet                0.70
yearbuilt                        0.22
structuretaxvaluedollarcnt       0.16
taxvaluedollarcnt                0.00
landtaxvaluedollarcnt            0.00
taxamount                        0.01
parcelid                         0.00
fips                             0.00
regionidcity                     1.98
regionidcounty                   0.00
regionidzip                      0.05
regionidneighborhood            63.71
dtype: float64

   3. Missing Values: Of the remaining missing values, can they be imputed or otherwise estimated?

        - Impute those that can be imputed with the method you feel best fits the attribute.
        - Decide whether to remove the rows or columns of any that cannot be reasonably imputed.
        - Document your reasons for the decisions on how to handle each of those.

In [13]:
round(zillow.isna().sum()/len(zillow)*100,2)

logerror                         0.00
transactiondate                  0.00
bathroomcnt                      0.00
bedroomcnt                       0.00
calculatedfinishedsquarefeet     0.16
latitude                         0.00
longitude                        0.00
lotsizesquarefeet                0.70
yearbuilt                        0.22
structuretaxvaluedollarcnt       0.16
taxvaluedollarcnt                0.00
landtaxvaluedollarcnt            0.00
taxamount                        0.01
parcelid                         0.00
fips                             0.00
regionidcity                     1.98
regionidcounty                   0.00
regionidzip                      0.05
regionidneighborhood            63.71
dtype: float64

In [14]:
# we're going to drop regionidneighborhood because it's missing over 50% of its data

In [15]:
zillow = zillow.drop(columns=['regionidneighborhood'])

In [16]:
# we're going to check out regionidcity and see if that column is vital to our analysis

In [17]:
zillow.regionidcity.value_counts()

12447.0    11452
5534.0      1795
40227.0     1492
46298.0     1428
16764.0     1087
           ...  
32927.0        3
31134.0        2
21395.0        1
36078.0        1
10815.0        1
Name: regionidcity, Length: 175, dtype: int64

In [18]:
# The meaning of these regionidcity codes is unclear, so drop the column for now;
# if it turns out to be needed later, impute its missing values then.
zillow = zillow.drop(columns=['regionidcity'])

In [19]:
# We're going to use the median to fill taxamount
zillow['taxamount'] = zillow['taxamount'].fillna(zillow.taxamount.median())

In [20]:
# fillna's with median for yearbuilt
zillow['yearbuilt'] = zillow['yearbuilt'].fillna(zillow.yearbuilt.median())


In [21]:
# fillna's with median for structuretaxvaluedollarcnt
zillow['structuretaxvaluedollarcnt'] = zillow['structuretaxvaluedollarcnt'].fillna(zillow.structuretaxvaluedollarcnt.median())

In [22]:
# fillna's with median for calculatedfinishedsquarefeet
zillow['calculatedfinishedsquarefeet'] = zillow['calculatedfinishedsquarefeet'].fillna(zillow.calculatedfinishedsquarefeet.median())

In [23]:
# regionidzip is an identifier, not a quantity: fill the gaps with the most
# frequent zip (the mode). A median can land on a code that matches no real zip.
zillow['regionidzip'] = zillow['regionidzip'].fillna(zillow['regionidzip'].mode().iloc[0])

In [24]:
zillow.isna().sum()

logerror                          0
transactiondate                   0
bathroomcnt                       0
bedroomcnt                        0
calculatedfinishedsquarefeet      0
latitude                          0
longitude                         0
lotsizesquarefeet               369
yearbuilt                         0
structuretaxvaluedollarcnt        0
taxvaluedollarcnt                 1
landtaxvaluedollarcnt             1
taxamount                         0
parcelid                          0
fips                              0
regionidcounty                    0
regionidzip                       0
dtype: int64

In [25]:
# Median-impute what is still missing: the single missing value in each of
# taxvaluedollarcnt and landtaxvaluedollarcnt, and the 369 missing lotsizesquarefeet values.
for _col in ('taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'lotsizesquarefeet'):
    zillow[_col] = zillow[_col].fillna(zillow[_col].median())

In [26]:
zillow.isna().sum()

logerror                        0
transactiondate                 0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
latitude                        0
longitude                       0
lotsizesquarefeet               0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
landtaxvaluedollarcnt           0
taxamount                       0
parcelid                        0
fips                            0
regionidcounty                  0
regionidzip                     0
dtype: int64

In [27]:
# Cache the cleaned frame as a CSV. to_csv is called with its defaults, so the
# row index is written out as an extra leading column.
zillow.to_csv('zillow.csv')

In [28]:
zillow = pd.read_csv('zillow.csv')

In [29]:
zillow.isna().sum()

Unnamed: 0                      0
logerror                        0
transactiondate                 0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
latitude                        0
longitude                       0
lotsizesquarefeet               0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
landtaxvaluedollarcnt           0
taxamount                       0
parcelid                        0
fips                            0
regionidcounty                  0
regionidzip                     0
dtype: int64

In [30]:
# Remove the index column picked up by the CSV round trip. errors='ignore'
# keeps this cell safe to re-run once the column is already gone.
zillow = zillow.drop(columns=['Unnamed: 0'], errors='ignore')

In [31]:
zillow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52442 entries, 0 to 52441
Data columns (total 17 columns):
logerror                        52442 non-null float64
transactiondate                 52442 non-null object
bathroomcnt                     52442 non-null float64
bedroomcnt                      52442 non-null float64
calculatedfinishedsquarefeet    52442 non-null float64
latitude                        52442 non-null float64
longitude                       52442 non-null float64
lotsizesquarefeet               52442 non-null float64
yearbuilt                       52442 non-null float64
structuretaxvaluedollarcnt      52442 non-null float64
taxvaluedollarcnt               52442 non-null float64
landtaxvaluedollarcnt           52442 non-null float64
taxamount                       52442 non-null float64
parcelid                        52442 non-null int64
fips                            52442 non-null float64
regionidcounty                  52442 non-null float64
regionidzip   

In [42]:
# Convert ids and zip codes to objects again: after reading the CSV back in,
# these columns are numeric once more (see the zillow.info() output above).
cols = ['parcelid','fips', 'regionidcounty', 'regionidzip']
zillow = change_data(zillow, cols)

In [43]:
zillow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52442 entries, 0 to 52441
Data columns (total 29 columns):
logerror                                 52442 non-null float64
transactiondate                          52442 non-null object
bathroomcnt                              52442 non-null float64
bedroomcnt                               52442 non-null float64
calculatedfinishedsquarefeet             52442 non-null float64
latitude                                 52442 non-null float64
longitude                                52442 non-null float64
lotsizesquarefeet                        52442 non-null float64
yearbuilt                                52442 non-null float64
structuretaxvaluedollarcnt               52442 non-null float64
taxvaluedollarcnt                        52442 non-null float64
landtaxvaluedollarcnt                    52442 non-null float64
taxamount                                52442 non-null float64
logerror_outliers                        52442 non-null floa

   4. Outliers: Original from exercises. Adapt as you see fit.

        - Write a function that accepts a series (i.e. one column from a data frame) and summarizes how many outliers are in the series. This function should accept a second parameter that determines how outliers are detected, with the ability to detect outliers in 3 ways: IQR, standard deviations (z-score), or percentiles.

In [34]:
# converts df to series
def convert_to_series(df):
    '''
    Stack the value counts of every column of a dataframe into one series.

    Per the original note, this is meant as a helper for a summarize
    function (not shown here).

    Object/string columns are counted per distinct value; every other
    column is counted in 10 equal-width bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    pandas.Series
        The per-column counts concatenated in column order. An empty
        int64 series when the frame has no columns.
    '''
    counts = []
    for col in df.columns:
        column = df[col]
        # Treat both legacy object columns and dedicated string dtypes as
        # categorical; binning them would fail.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if is_text:
            counts.append(column.value_counts())
        else:
            counts.append(column.value_counts(bins=10))

    if not counts:
        return pd.Series([], dtype='int64')

    # Series.append was removed in pandas 2.0; one concat over the collected
    # pieces also avoids re-copying the growing series on every iteration.
    return pd.concat(counts)

In [35]:
# outlier detection with IQR as a filter
def get_upper_outliers(s, k):
    '''
    Given a series and a cutoff value, k, returns the upper outliers for the
    series.

    The upper bound is Q3 + k * IQR. The values returned are 0.0 when the
    point is not an outlier, or otherwise the distance by which the
    observation exceeds the upper bound. Missing values stay missing.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to examine.
    k : float
        Multiplier applied to the interquartile range.

    Returns
    -------
    pandas.Series
        Float series aligned with ``s``.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    # Vectorised equivalent of max(x - upper_bound, 0) per element; it always
    # yields floats, so columns without outliers no longer come back as ints.
    return (s - upper_bound).clip(lower=0)

In [36]:
def add_upper_outlier_columns(df, k):
    '''
    Add a column with the suffix _outliers for all the numeric columns
    in the given dataframe.

    The frame is modified in place and also returned. Columns whose name
    already ends in _outliers are skipped, so calling this again refreshes
    the existing outlier columns instead of creating *_outliers_outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate.
    k : float
        IQR multiplier forwarded to get_upper_outliers.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the outlier columns added.
    '''
    suffix = '_outliers'
    source_cols = [col for col in df.select_dtypes('number').columns
                   if not str(col).endswith(suffix)]

    for col in source_cols:
        df[col + suffix] = get_upper_outliers(df[col], k)

    return df

In [37]:
add_upper_outlier_columns(zillow, 1.5)

Unnamed: 0,logerror,transactiondate,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,yearbuilt,structuretaxvaluedollarcnt,...,bedroomcnt_outliers,calculatedfinishedsquarefeet_outliers,latitude_outliers,longitude_outliers,lotsizesquarefeet_outliers,yearbuilt_outliers,structuretaxvaluedollarcnt_outliers,taxvaluedollarcnt_outliers,landtaxvaluedollarcnt_outliers,taxamount_outliers
0,0.025595,2017-01-01,3.5,4.0,3100.0,33634931.0,-117869207.0,4506.0,1998.0,485713.0,...,0.0,0.0,0.0,0,0.0,0,36307.75,0.0,0.0,0.0
1,0.055619,2017-01-01,1.0,2.0,1465.0,34449266.0,-119281531.0,12647.0,1967.0,88000.0,...,0.0,0.0,0.0,0,0.0,0,0.00,0.0,0.0,0.0
2,0.005383,2017-01-01,2.0,3.0,1243.0,33886168.0,-117823170.0,8432.0,1962.0,85289.0,...,0.0,0.0,0.0,0,0.0,0,0.00,0.0,0.0,0.0
3,-0.103410,2017-01-01,3.0,4.0,2376.0,34245180.0,-118240722.0,13038.0,1970.0,108918.0,...,0.0,0.0,0.0,0,0.0,0,0.00,0.0,0.0,0.0
4,-0.001011,2017-01-01,3.0,4.0,2962.0,34145202.0,-118179824.0,63000.0,1950.0,276684.0,...,0.0,0.0,0.0,0,49488.5,0,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52437,0.020615,2017-09-20,2.0,2.0,1286.0,34245368.0,-118282383.0,47405.0,1940.0,70917.0,...,0.0,0.0,0.0,0,33893.5,0,0.00,0.0,0.0,0.0
52438,0.013209,2017-09-21,2.0,4.0,1612.0,34300140.0,-118706327.0,12105.0,1964.0,50683.0,...,0.0,0.0,0.0,0,0.0,0,0.00,0.0,0.0,0.0
52439,0.037129,2017-09-21,1.0,3.0,1032.0,34040895.0,-118038169.0,5074.0,1954.0,32797.0,...,0.0,0.0,0.0,0,0.0,0,0.00,0.0,0.0,0.0
52440,0.007204,2017-09-25,2.0,3.0,1762.0,33937685.0,-117996709.0,6347.0,1955.0,140000.0,...,0.0,0.0,0.0,0,0.0,0,0.00,0.0,0.0,0.0


In [38]:
# For each *_outliers column, describe only the rows flagged as outliers (value > 0).
outlier_cols = [col for col in zillow.columns if col.endswith('_outliers')]
for col in outlier_cols:
    data = zillow.loc[zillow[col] > 0, col]
    print('~~~\n' + col)
    print(data.describe())

~~~
logerror_outliers
count    3796.000000
mean        0.231999
std         0.402541
min         0.000022
25%         0.038266
50%         0.098133
75%         0.240631
max         5.124312
Name: logerror_outliers, dtype: float64
~~~
bathroomcnt_outliers
count    1579.000000
mean        1.137112
std         0.969289
min         0.500000
25%         0.500000
50%         0.500000
75%         1.500000
max        13.500000
Name: bathroomcnt_outliers, dtype: float64
~~~
bedroomcnt_outliers
count    780.000000
mean       0.778205
std        0.753792
min        0.500000
25%        0.500000
50%        0.500000
75%        0.500000
max        8.500000
Name: bedroomcnt_outliers, dtype: float64
~~~
calculatedfinishedsquarefeet_outliers
count     2285.000000
mean      1184.944858
std       1487.354369
min          1.000000
25%        264.000000
50%        696.000000
75%       1530.000000
max      18071.000000
Name: calculatedfinishedsquarefeet_outliers, dtype: float64
~~~
latitude_outliers
count   

In [39]:
# Preview the rows with structuretaxvaluedollarcnt above 1,000,000; these are the
# candidates for removal. This cell only displays them and removes nothing.
zillow[zillow['structuretaxvaluedollarcnt'] > 1000000]

Unnamed: 0,logerror,transactiondate,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,yearbuilt,structuretaxvaluedollarcnt,...,bedroomcnt_outliers,calculatedfinishedsquarefeet_outliers,latitude_outliers,longitude_outliers,lotsizesquarefeet_outliers,yearbuilt_outliers,structuretaxvaluedollarcnt_outliers,taxvaluedollarcnt_outliers,landtaxvaluedollarcnt_outliers,taxamount_outliers
31,-0.259448,2017-01-02,5.0,4.0,5492.0,34144603.0,-117818513.0,67316.0,2006.0,1177804.0,...,0.0,1634.0,0.0,0,53804.5,0,728398.75,556411.875,0.00,5996.5775
45,0.063957,2017-01-02,6.5,5.0,9586.0,33496171.0,-117699380.0,30000.0,1992.0,1708622.0,...,0.0,5728.0,0.0,0,16488.5,0,1259216.75,2366849.875,1008366.75,24033.7975
188,0.097977,2017-01-03,6.0,5.0,5147.0,34162432.0,-118710724.0,13468.0,2003.0,1013212.0,...,0.0,1289.0,0.0,0,0.0,0,563806.75,273804.875,0.00,5194.9175
226,-0.104306,2017-01-03,5.0,6.0,6249.0,34064223.0,-118329480.0,22655.0,1924.0,1610255.0,...,0.5,2391.0,0.0,0,9143.5,0,1160849.75,2423731.875,1163615.75,30314.4475
319,0.045598,2017-01-04,9.0,8.0,8837.0,34141934.0,-118476940.0,56073.0,1951.0,2145317.0,...,2.5,4979.0,0.0,0,42561.5,0,1695911.75,2093096.875,297918.75,26229.9275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52173,0.070401,2017-09-18,4.0,3.0,3770.0,34035495.0,-118860949.0,8569.0,2004.0,1315849.0,...,0.0,0.0,0.0,0,0.0,0,866443.75,1012926.875,47216.75,14117.7175
52224,0.076615,2017-09-18,6.0,5.0,5945.0,34132565.0,-118384274.0,6840.0,2012.0,1872424.0,...,0.0,2087.0,0.0,0,0.0,0,1423018.75,1655066.875,132781.75,20695.3875
52262,0.078873,2017-09-19,2.0,3.0,1376.0,34035300.0,-118481564.0,6207.0,1923.0,1319825.0,...,0.0,0.0,0.0,0,0.0,0,870419.75,2092732.875,1123046.75,24236.8375
52314,0.078915,2017-09-19,6.0,5.0,5280.0,34041582.0,-118485838.0,8704.0,2006.0,1813000.0,...,0.0,1422.0,0.0,0,0.0,0,1363594.75,3551407.875,2088546.75,40988.5975


In [45]:
# Remove rows with structuretaxvaluedollarcnt of 1,000,000 or more. Reassigning
# a filtered copy (rather than drop(..., inplace=True)) keeps the same rows and
# original index labels without mutating the existing frame object.
zillow = zillow[~(zillow.structuretaxvaluedollarcnt >= 1_000_000)].copy()

In [48]:
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51670 entries, 0 to 52441
Data columns (total 29 columns):
logerror                                 51670 non-null float64
transactiondate                          51670 non-null object
bathroomcnt                              51670 non-null float64
bedroomcnt                               51670 non-null float64
calculatedfinishedsquarefeet             51670 non-null float64
latitude                                 51670 non-null float64
longitude                                51670 non-null float64
lotsizesquarefeet                        51670 non-null float64
yearbuilt                                51670 non-null float64
structuretaxvaluedollarcnt               51670 non-null float64
taxvaluedollarcnt                        51670 non-null float64
landtaxvaluedollarcnt                    51670 non-null float64
taxamount                                51670 non-null float64
logerror_outliers                        51670 non-null floa

   5. Use your function defined above to identify columns where you should handle the outliers.

   6. Write a function that accepts the zillow data frame and removes the outliers. You should make a decision and document how you will remove outliers.

   7. Is there erroneous data you have found that you need to remove or repair? If so, take action.

   8. Are there outliers you want to "squeeze in" to a max value? (e.g. all bathrooms > 6 => bathrooms = 6). If so, make those changes.

# Exploration with Clustering
## Cluster the Target Variable
    Why? Turning the continuous target variable into clusters reduces its noise, which can make trends easier to see when we compare those clusters against other variables through visualizations or tests.

    Perform clustering with logerror as the only feature used in the clustering algorithm. Decide on a number of clusters to use, and store the cluster predictions back onto your data frame as cluster_target. Look at the centroids that were produced in this process. What do they tell you?

    Use the produced clusters to help you explore through visualization how logerror relates to other variables. (A common way to do this is to use color to indicate the cluster id, and the other variables can be your x-axis and y-axis. (hint: look at your swarmplot function)).

## Cluster Independent Variables
   You should also perform some clustering based on a number of independent variables. Create and evaluate several clustering models based on subsets of the independent variables. Here are some ideas:

   - Location, that is, latitude and longitude
   - Size (finished square feet)
   - Location and size
   - Be sure to use these new clusters in exploring your data, and interpret what these clusters tell you.

## Test the Significance of Clusters
    Use statistical testing methods to determine whether the clusters you have created are significant in terms of their relationship to logerror.

# Modeling
## Feature Engineering
   1. Remove variables that are not needed, wanted, useful, or are redundant.
   2. Add any features you think may be useful.
   3. Split your data into training and test sets.
   4. Create subsets of data if you would like to create multiple models and then merge (such as, a different model for each cluster or for each county).

# Model Selection
   1. Train at least 3 different models (a model is different if there are changes in one or more of the following: features, hyper-parameters, algorithm). Create object, fit, predict & evaluate. Use mean absolute error or mean squared error to evaluate. Also, try regression algorithms you have not used before.
   2. Evaluate your best model on your test data set to get an idea of your model's out of sample error.