In [289]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import warnings
warnings.filterwarnings("ignore")

import acquire
import summarize
import prepare

In [309]:
# Load the Zillow data through the project-local `acquire` module and bind it to `df`,
# the frame every following cell filters and extends.
df = acquire.get_zillow_data()

In [310]:
# Keep only the rows whose land-use description is exactly 'Single Family Residential'.
df = df.loc[df['propertylandusedesc'] == 'Single Family Residential']

In [311]:
# First sparsity pass, done by the project helper in `prepare`.
# NOTE(review): reading both keywords as "minimum proportion of non-null values required"
# (confirm in prepare.handle_missing_values), this removes columns that are more than
# 99% missing and rows that are more than 60% missing.
df = prepare.handle_missing_values(
    df,
    prop_required_column=0.01,
    prop_required_row=0.40,
)

In [312]:
# Collapse all pool / spa attributes into a single indicator column, `has_pool`.
pool_cols = ['hashottuborspa', 'poolcnt', 'poolsizesum', 'pooltypeid2', 'pooltypeid7']
# A missing pool attribute counts as 0 so the row-wise total below is always defined.
pool = df[pool_cols].fillna(0)
# A positive row total means at least one pool attribute is set: mark those rows with 1.
# Rows that do not match stay NaN here and are zero-filled further down.
pool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1
# Attach the indicator to the main frame (index-aligned join).
# The source pool columns are NOT dropped in this step.
df = df.join(pool[['has_pool']])

# Remaining indicators: each mask is built from an existing column, then matching rows get 1.
indicator_masks = [
    ('is_taxdelinquent', df['taxdelinquencyflag'] == 'Y'),
    ('has_fireplace', df['fireplacecnt'] > 0),
    ('has_garage', df['garagecarcnt'] > 0),
]
for indicator_name, indicator_mask in indicator_masks:
    df.loc[indicator_mask, indicator_name] = 1

# Every row that was not flagged above becomes 0 in all four indicator columns.
fill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']
df[fill_with_0] = df[fill_with_0].fillna(0)

# Second, stricter sparsity pass.
# NOTE(review): reading both keywords as "minimum proportion of non-null values required"
# (confirm in prepare.handle_missing_values), this removes columns that are more than
# 5% missing and rows that are more than 1% missing.
df = prepare.handle_missing_values(
    df,
    prop_required_column=0.95,
    prop_required_row=0.99,
)

In [313]:
# Sanity check: list the row positions where the total tax value differs from
# land value + structure value. An empty array means the three columns agree everywhere.
component_total = df['landtaxvaluedollarcnt'] + df['structuretaxvaluedollarcnt']
value_residual = df['taxvaluedollarcnt'] - component_total
np.where(value_residual != 0)


(array([], dtype=int64),)

In [314]:
# Operands shared by the engineered columns below.
finished_sqft = df['calculatedfinishedsquarefeet']
bedrooms = df['bedroomcnt']
bathrooms = df['bathroomcnt']
full_baths = df['fullbathcnt']

# Value densities: structure value per finished square foot, land value per lot square foot.
df['structure_dollar_per_sqft'] = df['structuretaxvaluedollarcnt'] / finished_sqft
df['land_dollar_per_sqft'] = df['landtaxvaluedollarcnt'] / df['lotsizesquarefeet']

# Finished area minus a fixed allowance of 121 per bedroom and 36 per bathroom.
# NOTE(review): 121 and 36 look like assumed average room sizes in sqft - confirm the source.
df['living_area_sqft'] = finished_sqft - (bedrooms * 121 + bathrooms * 36)

# NOTE(review): this is tax value divided by tax amount, i.e. the reciprocal of
# amount / value. Confirm the column name matches how later cells interpret it.
df['tax_rate'] = df['taxvaluedollarcnt'] / df['taxamount']

# Bedrooms weigh 2, full baths 1, and the remaining (bathroomcnt - fullbathcnt) baths 0.5.
partial_baths = bathrooms - full_baths
df['bedbath_index'] = bedrooms * 2 + full_baths + .5 * partial_baths

In [315]:
# Age in years relative to the hard-coded reference year 2017.
df['age'] = 2017 - df['yearbuilt']

In [318]:
# create a reference table of the box coordinates

def box_coordinate_reference():
    """Return the lookup table of the 15 latitude/longitude bounding boxes.

    Returns
    -------
    pandas.DataFrame
        One row per box with the columns ``box`` (ids 1 through 15),
        ``lat_max``, ``lat_min``, ``lon_max`` and ``lon_min``, in that order.
    """
    # Each list holds one bound for boxes 1..15, in box order.
    lat_max = [
        34.20554422, 34.04802211, 33.99908439, 33.80387687, 33.78696072,
        33.69952829, 33.59712547, 33.49972314, 34.38229689, 34.00526394,
        34.06277594, 33.90355421, 33.80675031, 33.72661788, 33.63681764,
    ]
    lat_min = [
        34.00187156, 33.97400195, 33.6995076, 33.69879119, 33.60475644,
        33.60475644, 33.50224095, 33.40473173, 34.08712558, 33.80359565,
        33.89700744, 33.74602844, 33.64904627, 33.56876661, 33.47880174,
    ]
    lon_max = [
        -119.1012469, -118.4853365, -118.3643017, -118.010889, -117.995256,
        -117.8587745, -117.7050516, -117.595489, -117.1573461, -118.0072968,
        -116.7958861, -116.765738, -116.5865933, -116.4174318, -116.2908127,
    ]
    lon_min = [
        -119.2668432, -118.5999006, -118.4982449, -118.5063201, -118.1009925,
        -118.1009925, -117.9472695, -117.837707, -118.9924736, -118.3552993,
        -118.0734651, -118.0401843, -117.8610396, -117.7330499, -117.6064309,
    ]
    # Box ids are consecutive and start at 1.
    box_ids = list(range(1, len(lat_max) + 1))

    return pd.DataFrame({
        'box': box_ids,
        'lat_max': lat_max,
        'lat_min': lat_min,
        'lon_max': lon_max,
        'lon_min': lon_min,
    })

In [322]:
def find_box_id(df, ref_df=None):
    """Attach a ``box_id`` column naming the reference box each parcel falls in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``parcelid``, ``latitude`` and ``longitude``. The raw
        coordinates are divided by 1e6 before being compared with the box bounds.
        The input frame is not modified.
    ref_df : pandas.DataFrame, optional
        Box table with the columns ``box``, ``lat_min``, ``lat_max``, ``lon_min``
        and ``lon_max``. When omitted, the table from ``box_coordinate_reference()``
        is used, so the function no longer depends on a notebook-level global
        being assigned first.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``parcelid`` moved to the first column, a fresh 0..n-1 index and
        a trailing ``box_id`` column. Bounds are inclusive on both sides. A parcel
        that lies in no box gets a missing ``box_id``; a parcel that lies in several
        overlapping boxes yields one output row per box.
    """
    if ref_df is None:
        ref_df = box_coordinate_reference()

    # Scale once, on separate Series, so nothing is written into a slice of `df`.
    lat = df['latitude'] / 1e6
    lon = df['longitude'] / 1e6

    # One small (parcelid, box_id) frame per box; concatenated once after the loop
    # instead of growing a frame with the removed DataFrame.append.
    matches = []
    for ref in ref_df.itertuples(index=False):
        in_box = (lat.between(ref.lat_min, ref.lat_max)
                  & lon.between(ref.lon_min, ref.lon_max))
        matches.append(df.loc[in_box, ['parcelid']].assign(box_id=int(ref.box)))

    if matches:
        box_df = pd.concat(matches, ignore_index=True)
    else:
        # Empty reference table: nothing can match, every box_id stays missing.
        box_df = pd.DataFrame(columns=['parcelid', 'box_id'])

    # A parcelid repeated k times in `df` matches each box k times; keep a single
    # (parcelid, box_id) pair so the join below does not multiply those rows k x k.
    box_df = box_df.drop_duplicates().set_index('parcelid')

    # Left join on parcelid keeps every input row, matched or not.
    return df.set_index('parcelid').join(box_df).reset_index()

In [323]:
# Build the box reference table and keep it under the notebook-level name `ref_df`.
ref_df = box_coordinate_reference()
# Tag every parcel row with a box_id; `df` is rebound to the joined result.
df = find_box_id(df)