# Acquire

**Goal: Your goal is to predict the values of single-unit properties using the observations from 2017.**

**import**

In [1]:
# standard library
import sys
import warnings

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# the project helpers are imported from ./util_, so that folder goes on the import path first
sys.path.append("./util_")
import acquire_
import prepare_

# show floats with thousands separators and two decimals in a 20-character field
pd.set_option("display.float_format", '{:20,.2f}'.format)

# silence all warnings from this point on
warnings.filterwarnings("ignore")

**get data**

In [2]:
# SQL for the acquire step:
#   - every column of properties_2017, inner-joined to predictions_2017 on parcelid
#   - the *type lookup tables are left-joined on their id columns, so properties
#     without a matching lookup row are still returned
#   - only rows whose unitcnt equals 1 are kept
query = """
SELECT *
FROM properties_2017 AS prop
INNER JOIN predictions_2017 USING(parcelid)
LEFT JOIN airconditioningtype USING(airconditioningtypeid)
LEFT JOIN architecturalstyletype USING(architecturalstyletypeid)
LEFT JOIN buildingclasstype USING(buildingclasstypeid)
LEFT JOIN heatingorsystemtype USING(heatingorsystemtypeid)
LEFT JOIN propertylandusetype USING(propertylandusetypeid)
LEFT JOIN storytype USING(storytypeid)
LEFT JOIN typeconstructiontype USING(typeconstructiontypeid)
WHERE prop.unitcnt = 1;
"""

In [3]:
# run the query against the codeup "zillow" database through the project's acquire helper
# NOTE(review): the helper returns two values; the second one (q) is not used again in the visible cells
zillow, q = acquire_.get_codeup_sql_data_(
    db_name="zillow",
    query=query,
    fileName="zillow_single_family",
)

## Understand data

In [4]:
# (rows, columns) of the raw pull
zillow.shape

(47414, 69)

In [5]:
# names of every column returned by the query
zillow.columns

Index(['typeconstructiontypeid', 'storytypeid', 'propertylandusetypeid',
       'heatingorsystemtypeid', 'buildingclasstypeid',
       'architecturalstyletypeid', 'airconditioningtypeid', 'parcelid', 'id',
       'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid',
       'calculatedbathnbr', 'decktypeid', 'finishedfloor1squarefeet',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
       'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
       'finishedsquarefeet6', 'fips', 'fireplacecnt', 'fullbathcnt',
       'garagecarcnt', 'garagetotalsqft', 'hashottuborspa', 'latitude',
       'longitude', 'lotsizesquarefeet', 'poolcnt', 'poolsizesum',
       'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertyzoningdesc',
       'rawcensustractandblock', 'regionidcity', 'regionidcounty',
       'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr',
       'unitcnt', 'yardbuildingsqft17',

In [6]:
# dtype and non-null count per column; show_counts=True forces the counts to be printed
zillow.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47414 entries, 0 to 47413
Data columns (total 69 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   typeconstructiontypeid        0 non-null      object 
 1   storytypeid                   0 non-null      object 
 2   propertylandusetypeid         47414 non-null  float64
 3   heatingorsystemtypeid         46685 non-null  float64
 4   buildingclasstypeid           8 non-null      float64
 5   architecturalstyletypeid      0 non-null      object 
 6   airconditioningtypeid         22070 non-null  float64
 7   parcelid                      47414 non-null  int64  
 8   id                            47414 non-null  int64  
 9   basementsqft                  0 non-null      object 
 10  bathroomcnt                   47414 non-null  float64
 11  bedroomcnt                    47414 non-null  float64
 12  buildingqualitytypeid         46923 non-null  float64
 13  c

In [7]:
# summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
zillow.describe()

Unnamed: 0,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,airconditioningtypeid,parcelid,id,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,...,numberofstories,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,id.1,logerror
count,47414.0,46685.0,8.0,22070.0,47414.0,47414.0,47414.0,47414.0,46923.0,47368.0,...,20.0,47350.0,47414.0,47414.0,47414.0,47410.0,2210.0,47298.0,47414.0,47414.0
mean,262.5,3.46,3.88,1.0,11914053.44,1498375.9,2.24,2.95,6.63,2.24,...,1.35,187052.9,473141.27,2016.0,286340.85,5955.53,14.11,60383594664841.11,38900.23,0.01
std,3.94,2.39,0.35,0.0,3279810.25,860854.17,0.99,1.01,1.69,0.99,...,0.59,240719.11,688181.63,0.0,503762.33,8114.74,2.34,1943422866929.52,22573.53,0.16
min,31.0,2.0,3.0,1.0,10711855.0,1307.0,0.0,0.0,1.0,1.0,...,1.0,129.0,1000.0,2016.0,161.0,120.84,4.0,60371011101000.0,3.0,-4.66
25%,261.0,2.0,4.0,1.0,11178074.0,760320.75,2.0,2.0,6.0,2.0,...,1.0,83785.0,184850.75,2016.0,64802.0,2567.36,14.0,60372351002010.0,19292.25,-0.03
50%,261.0,2.0,4.0,1.0,11819691.5,1501084.0,2.0,3.0,7.0,2.0,...,1.0,134077.5,328229.5,2016.0,176778.5,4263.53,15.0,60374315013001.5,38818.5,0.01
75%,266.0,7.0,4.0,1.0,12511668.75,2245365.75,3.0,4.0,8.0,3.0,...,2.0,212207.0,533298.0,2016.0,336808.25,6670.62,15.0,60376023024017.0,58362.5,0.04
max,269.0,20.0,4.0,1.0,167688532.0,2982188.0,13.0,11.0,12.0,13.0,...,3.0,9164901.0,25381250.0,2016.0,22335500.0,290998.06,99.0,483030105084015.06,77613.0,5.26


In [8]:
# how many columns are numeric and how many have object dtype
numerics = zillow.select_dtypes(include="number").shape[1]
objects = zillow.select_dtypes(include="object").shape[1]

print("Numeric col count:", numerics)
print("object col count:", objects)

Numeric col count: 41
object col count: 28


Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an attribute name, the first column is the number of rows with missing values for that attribute, and the second column is the percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [9]:
# per-attribute missing-value summary: one row per column of zillow
zillow_null_status = (
    zillow.isna()
    .sum(axis=0)
    .to_frame(name="num_rows_missing")
)

# share of all rows that are missing for each attribute (a 0-1 fraction, not multiplied by 100)
zillow_null_status["pct_rows_missing"] = zillow_null_status["num_rows_missing"] / len(zillow)
zillow_null_status.head()

Unnamed: 0,num_rows_missing,pct_rows_missing
typeconstructiontypeid,47414,1.0
storytypeid,47414,1.0
propertylandusetypeid,0,0.0
heatingorsystemtypeid,729,0.02
buildingclasstypeid,47406,1.0


**What I see:**

- I have 47,414 rows and 69 columns
- 41 of the 69 columns are numeric and 28 are object (string) columns
- Several columns are entirely or almost entirely null (for example `typeconstructiontypeid`, `storytypeid` and `buildingclasstypeid`)
- I also see the descriptive statistics of my data

# Prepare

In [10]:
# transpose so every attribute becomes a row, which makes the wide frame easier to scan
zillow.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47404,47405,47406,47407,47408,47409,47410,47411,47412,47413
typeconstructiontypeid,,,,,,,,,,,...,,,,,,,,,,
storytypeid,,,,,,,,,,,...,,,,,,,,,,
propertylandusetypeid,261.00,266.00,261.00,261.00,261.00,266.00,261.00,266.00,261.00,266.00,...,261.00,266.00,266.00,261.00,261.00,266.00,261.00,261.00,261.00,261.00
heatingorsystemtypeid,2.00,2.00,2.00,,2.00,2.00,2.00,2.00,7.00,2.00,...,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,7.00
buildingclasstypeid,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
buildingclassdesc,,,,,,,,,,,...,,,,,,,,,,
heatingorsystemdesc,Central,Central,Central,,Central,Central,Central,Central,Floor/Wall,Central,...,Central,Central,Central,Central,Central,Central,Central,Central,Central,Floor/Wall
propertylandusedesc,Single Family Residential,Condominium,Single Family Residential,Single Family Residential,Single Family Residential,Condominium,Single Family Residential,Condominium,Single Family Residential,Condominium,...,Single Family Residential,Condominium,Condominium,Single Family Residential,Single Family Residential,Condominium,Single Family Residential,Single Family Residential,Single Family Residential,Single Family Residential
storydesc,,,,,,,,,,,...,,,,,,,,,,


**Handle missing values**

Remove columns that are less than 50% populated, then remove rows that have values in fewer than 75% of the remaining columns.

In [11]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75, col_drop = None):
    '''
    Drop sparsely populated columns, then sparsely populated rows, then any
    explicitly listed columns, and return the result as a new dataframe.

    The dataframe passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations (rows) and attributes (columns).
    prop_required_column : float
        Share of rows that must be non-null for a column to be kept.
    prop_required_row : float
        Share of the surviving columns that must be non-null for a row to be kept.
    col_drop : list of column labels, optional
        Extra columns to remove after the threshold filtering. Every label must
        still exist at that point, otherwise DataFrame.drop raises KeyError.

    Returns
    -------
    pandas.DataFrame
    '''
    # minimum number of non-null values a column needs in order to survive
    min_non_null_per_column = int(round(prop_required_column * len(df.index), 0))
    df = df.dropna(axis=1, thresh=min_non_null_per_column)

    # minimum number of non-null values a row needs, counted over the columns that are left
    min_non_null_per_row = int(round(prop_required_row * len(df.columns), 0))
    df = df.dropna(axis=0, thresh=min_non_null_per_row)

    # drop more specific columns
    if col_drop is not None:
        df = df.drop(columns=col_drop)
    return df

In [12]:
# columns removed in addition to the missing-value thresholds
# (the list previously held a stray "" with no comma after it, which Python silently
#  concatenated with "id"; the resulting entries are the same six names)
col_drop = [
    "propertylandusetypeid",
    "heatingorsystemtypeid",
    "buildingqualitytypeid",
    "calculatedfinishedsquarefeet",
    "calculatedbathnbr",
    "id",
]

zillow = handle_missing_values(df=zillow, col_drop=col_drop)
zillow.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,finishedsquarefeet12,fips,fullbathcnt,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
0,12177905,3.0,4.0,2376.0,6037.0,3.0,34245180.0,-118240722.0,13038.0,0101,...,108918.0,145143.0,2016.0,36225.0,1777.51,60373001001006.0,-0.1,2017-01-01,Central,Single Family Residential
1,10887214,3.0,3.0,1312.0,6037.0,3.0,34185120.0,-118414640.0,278581.0,010C,...,73681.0,119407.0,2016.0,45726.0,1533.89,60371236012000.0,0.01,2017-01-01,Central,Condominium
2,12095076,3.0,4.0,2962.0,6037.0,3.0,34145202.0,-118179824.0,63000.0,0101,...,276684.0,773303.0,2016.0,496619.0,9516.26,60374608001014.0,-0.0,2017-01-01,Central,Single Family Residential
3,12069064,1.0,2.0,738.0,6037.0,1.0,34149214.0,-118239357.0,4214.0,0100,...,18890.0,218552.0,2016.0,199662.0,2366.08,60373020041001.0,0.1,2017-01-01,,Single Family Residential
4,12790562,3.0,4.0,3039.0,6037.0,3.0,33960230.0,-118006914.0,20028.0,0100,...,177527.0,220583.0,2016.0,43056.0,3104.19,60375002024006.0,-0.04,2017-01-02,Central,Single Family Residential


**Rename columns**

In [13]:
# give the columns used later shorter, readable names
# NOTE(review): "calculatedfinishedsquarefeet" is listed in col_drop in the previous code
# cell, so that entry likely matches nothing here (rename ignores missing labels) —
# confirm which square-footage column is meant to become "sqr_feet"
zillow = zillow.rename(
    {
        "bedroomcnt": "bedrooms",
        "bathroomcnt": "bathrooms",
        "calculatedfinishedsquarefeet": "sqr_feet",
        "taxvaluedollarcnt": "tax_value",
        "yearbuilt": "year_built",
        "taxamount": "tax_amount",
        "fips": "county",
    },
    axis="columns",
)


**convert data type**

In [14]:
# zillow.select_dtypes("float").drop(columns=["rawcensustractandblock","tax_amount","logerror", "tax_value","landtaxvaluedollarcnt","structuretaxvaluedollarcnt"])


In [15]:
# # convert data type from float to int
# zillow.bedrooms = zillow.bedrooms.astype(int)
# zillow.year_built = zillow.year_built.astype(int)

**Remove outliers**

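This is done using box plots in the explore phase, looking only at the training data.

Outliers are identified with the interquartile range (IQR): a value counts as an upper outlier when it lies more than k × IQR above the third quartile.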

In [16]:
# add a new column to the data frame that indicates the outliers in the numeric columns.
def get_upper_outliers(s, k):
    '''
    Given a numeric series and a cutoff multiplier, k, return how far each
    observation lies above the upper outlier bound.

    The upper bound is q3 + k * IQR, where q1 and q3 are the 25th and 75th
    percentiles of the series and IQR = q3 - q1.

    The returned series has the same index and name as s and always holds
    floats: 0.0 where the observation is not an outlier, otherwise the distance
    between the observation and the upper bound. Missing values stay missing.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    # vectorised form of max(x - upper_bound, 0) for every element
    return (s - upper_bound).clip(lower=0)

def add_upper_outlier_columns(df, k):
    '''
    Attach a companion column named <column>_outliers for every numeric
    column of the given dataframe.

    Each companion column is the result of get_upper_outliers(df[column], k).
    The new columns are written onto df itself, so the frame passed in is
    modified, and that same frame is returned.
    '''
    # snapshot the numeric column names before any new column is added
    numeric_names = list(df.select_dtypes('number').columns)

    for name in numeric_names:
        df[name + '_outliers'] = get_upper_outliers(df[name], k)

    return df

# add_upper_outlier_columns writes the *_outliers columns onto the frame it receives and
# returns that same frame, so zillow_outliers and zillow are one object from here on
zillow_outliers = add_upper_outlier_columns(zillow, k=1.5)
zillow_outliers

Unnamed: 0,parcelid,bathrooms,bedrooms,finishedsquarefeet12,county,fullbathcnt,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,...,roomcnt_outliers,unitcnt_outliers,year_built_outliers,structuretaxvaluedollarcnt_outliers,tax_value_outliers,assessmentyear_outliers,landtaxvaluedollarcnt_outliers,tax_amount_outliers,censustractandblock_outliers,logerror_outliers
0,12177905,3.00,4.00,2376.00,6037.00,3.00,34245180.00,-118240722.00,13038.00,0101,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10887214,3.00,3.00,1312.00,6037.00,3.00,34185120.00,-118414640.00,278581.00,010C,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,12095076,3.00,4.00,2962.00,6037.00,3.00,34145202.00,-118179824.00,63000.00,0101,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,12069064,1.00,2.00,738.00,6037.00,1.00,34149214.00,-118239357.00,4214.00,0100,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,12790562,3.00,4.00,3039.00,6037.00,3.00,33960230.00,-118006914.00,20028.00,0100,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47409,10833991,3.00,3.00,1741.00,6037.00,3.00,34202400.00,-118502000.00,59487.00,010C,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
47410,11000655,2.00,2.00,1286.00,6037.00,2.00,34245368.00,-118282383.00,47405.00,0100,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
47411,12773139,1.00,3.00,1032.00,6037.00,1.00,34040895.00,-118038169.00,5074.00,0100,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
47412,12826780,2.00,3.00,1762.00,6037.00,2.00,33937685.00,-117996709.00,6347.00,0100,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [17]:
# describe only the flagged (greater than zero) values of every *_outliers column
outlier_cols = [name for name in zillow_outliers.columns if name.endswith('_outliers')]
for name in outlier_cols:
    flagged = zillow_outliers.loc[zillow_outliers[name] > 0, name]
    print('~~~\n' + name)
    print(flagged.describe())

~~~
parcelid_outliers
count                  21.00
mean          145,959,960.70
std            32,850,903.88
min             2,586,871.12
25%           153,124,435.12
50%           153,124,979.12
75%           153,125,942.12
max           153,176,471.12
Name: parcelid_outliers, dtype: float64
~~~
bathrooms_outliers
count               1,283.00
mean                    1.07
std                     0.97
min                     0.50
25%                     0.50
50%                     0.50
75%                     1.50
max                     8.50
Name: bathrooms_outliers, dtype: float64
~~~
bedrooms_outliers
count                  30.00
mean                    1.47
std                     0.78
min                     1.00
25%                     1.00
50%                     1.00
75%                     2.00
max                     4.00
Name: bedrooms_outliers, dtype: float64
~~~
finishedsquarefeet12_outliers
count               2,740.00
mean                1,088.66
std                 1,43

In [18]:


# # remove outliers
# zillow = zillow[zillow.bedrooms <= 7]
# zillow = zillow[zillow.bathrooms <= 7]
# zillow = zillow[zillow.year_built >= 1900]
# zillow = zillow[zillow.sqr_feet <= 5000]
# zillow = zillow[zillow.tax_amount <= 20000]

**feature engineering**

In [19]:
# Replace the FIPS codes with county names (all three are California counties):
# 6037 = Los Angeles, 6059 = Orange, 6111 = Ventura.
# The code is turned into a string first and then replaced as a whole value, so no
# substring or regex matching is involved; any other value is left as its string form.
zillow["county"] = zillow["county"].astype(str).replace({
    "6037.0": "Los Angeles",
    "6059.0": "Orange",
    "6111.0": "Ventura",
})


## Split

In [22]:
# carve the prepared frame into training, validation and testing sets with the project helper
train, validate, test = prepare_.split_data_(
    df=zillow,
    test_size=0.2,
    validate_size=0.2,
    random_state=95,
)
train.shape, validate.shape, test.shape

((28448, 53), (9483, 53), (9483, 53))

**Save split**

In [23]:
# hand the full frame and the three splits to the project helper for saving
# (its return message below reports four data sets saved as .csv)
prepare_.save_split_data(encoded_df=zillow, train=train, validate=validate, test=test)

'Four data sets saved as .csv'