# Acquire

**Goal: Your goal is to predict the values of single unit properties using the obervations from 2017.**

**import**

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
from sklearn.model_selection import train_test_split

sys.path.append("./util_")
import acquire_
import prepare_


**get data**

In [2]:
# sql query
query = """
SELECT bedroomcnt, 
		bathroomcnt,
        calculatedfinishedsquarefeet,
        taxvaluedollarcnt,
        yearbuilt,
        taxamount,
        fips
FROM properties_2017
WHERE propertylandusetypeid = 261 -- Single family home
"""

# UNION

# SELECT bedroomcnt, 
# 		bathroomcnt,
#         calculatedfinishedsquarefeet,
#         taxvaluedollarcnt,
#         yearbuilt,
#         taxamount,
#         fips
# FROM properties_2017
# WHERE propertylandusetypeid = 261 -- Single family home;
# """

In [3]:
# get data from codeup database
zillow, q = acquire_.get_codeup_sql_data_(db_name="zillow", query=query,fileName="zillow_single_family")

## Understand data

In [4]:
zillow.shape

(2152863, 7)

In [5]:
zillow.columns

Index(['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet',
       'taxvaluedollarcnt', 'yearbuilt', 'taxamount', 'fips'],
      dtype='object')

In [6]:
zillow.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2152852 non-null  float64
 1   bathroomcnt                   2152852 non-null  float64
 2   calculatedfinishedsquarefeet  2144379 non-null  float64
 3   taxvaluedollarcnt             2152370 non-null  float64
 4   yearbuilt                     2143526 non-null  float64
 5   taxamount                     2148421 non-null  float64
 6   fips                          2152863 non-null  float64
dtypes: float64(7)
memory usage: 115.0 MB


In [7]:
zillow.describe()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
count,2152852.0,2152852.0,2144379.0,2152370.0,2143526.0,2148421.0,2152863.0
mean,3.287196,2.230688,1862.855,461896.2,1960.95,5634.866,6048.377
std,0.9547544,0.9992796,1222.125,699676.0,22.1622,8178.91,20.43329
min,0.0,0.0,1.0,1.0,1801.0,1.85,6037.0
25%,3.0,2.0,1257.0,188170.2,1949.0,2534.98,6037.0
50%,3.0,2.0,1623.0,327671.0,1958.0,4108.95,6037.0
75%,4.0,3.0,2208.0,534527.0,1976.0,6414.32,6059.0
max,25.0,32.0,952576.0,98428910.0,2016.0,1337756.0,6111.0


In [8]:
# count of numeric columns and object columns
numerics = len(zillow.select_dtypes("number").columns)
objects = len(zillow.select_dtypes("object").columns)

print("Numeric col count:", numerics)
print("object col count:", objects)

Numeric col count: 7
object col count: 0


**What I see:**

- I have 2152863 rows and 7 columns
- 7 of the 29 columns are numric while 0 of them are string object colums
- I have unsure null values in my data
- I also see the descriptive statistics of my data

# Prepare

In [9]:
zillow.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2152853,2152854,2152855,2152856,2152857,2152858,2152859,2152860,2152861,2152862
bedroomcnt,0.0,0.0,0.0,0.0,4.0,0.0,3.0,3.0,0.0,0.0,...,4.0,0.0,3.0,4.0,0.0,4.0,4.0,0.0,3.0,4.0
bathroomcnt,0.0,0.0,0.0,0.0,2.0,0.0,4.0,2.0,0.0,0.0,...,2.0,0.0,2.5,4.0,0.0,3.0,4.5,0.0,2.5,4.0
calculatedfinishedsquarefeet,,,,,3633.0,,1620.0,2077.0,,,...,1987.0,,1809.0,4375.0,,2262.0,3127.0,,1974.0,2110.0
taxvaluedollarcnt,27516.0,10.0,10.0,2108.0,296425.0,124.0,847770.0,646760.0,6730242.0,15532.0,...,259913.0,1198476.0,405547.0,422400.0,1087111.0,960756.0,536061.0,208057.0,424353.0,554009.0
yearbuilt,,,,,2005.0,,2011.0,1926.0,,,...,1955.0,,2012.0,2015.0,,2015.0,2014.0,,2015.0,2014.0
taxamount,,,,174.21,6941.39,,10244.94,7924.68,80348.13,248.89,...,3175.66,,4181.1,13877.56,19313.08,13494.52,6244.16,5783.88,5302.7,6761.2
fips,6037.0,6037.0,6037.0,6037.0,6037.0,6037.0,6037.0,6037.0,6037.0,6037.0,...,6059.0,6037.0,6059.0,6037.0,6059.0,6059.0,6059.0,6059.0,6059.0,6037.0


**Rename columns**

In [10]:
# rename dataframe columns
zillow = zillow.rename(columns={"bedroomcnt":"bedrooms",
                       "bathroomcnt":"bathrooms",
                       "calculatedfinishedsquarefeet":"sqr_feet",
                      "taxvaluedollarcnt":"tax_value",
                      "yearbuilt":"year_built",
                      "taxamount":"tax_amount",
                      "fips":"county"})

**drop nulls**

In [11]:
# drop all nulls in the dataframe
zillow = zillow.dropna()

**convert data type**

In [12]:
# convert data type from float to int
zillow.bedrooms = zillow.bedrooms.astype(int)
zillow.year_built = zillow.year_built.astype(int)

**Remove duplicated rows**

In [13]:
# remove the duplocated rows
zillow = zillow.drop_duplicates(keep="first")

**Remove outliers**

This is done using box plot in the explore phase. only looking at the training data.

In [14]:
# remove outliers
zillow = zillow[zillow.bedrooms <= 7]
zillow = zillow[zillow.bathrooms <= 7]
zillow = zillow[zillow.year_built >= 1900]
zillow = zillow[zillow.sqr_feet <= 5000]
zillow = zillow[zillow.tax_amount <= 20000]

**feature engineeing**

In [15]:
# Rename the unique values in fips to county names
zillow.county = zillow.county.astype(str).str.replace("6037.0","Los Angeles").str.replace("6059.0","Orange").str.replace("6111.0","Sam Juan")


## Split

In [16]:
# split the data into training, validation and testing sets
train, validate, test = prepare_.split_data_(df=zillow,
                    test_size=0.2, 
                     validate_size=0.2, 
                     random_state=95)
(train.shape, validate.shape, test.shape)

((1237902, 7), (412634, 7), (412635, 7))

**Save split**

In [17]:
prepare_.save_split_data(encoded_df=zillow, train=train, validate=validate, test=test)

'Four data sets saved as .csv'