# Acquire

**Goal: Your goal is to predict the values of single unit properties using the obervations from 2017.**

**import**

In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# data separation/transformation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# system manipulation
import sys
sys.path.append("./util_")
import acquire_
import prepare_

# other
import warnings
warnings.filterwarnings("ignore")


**get data**

In [2]:
# sql query
query = """
SELECT bedroomcnt, 
        bathroomcnt,
        calculatedfinishedsquarefeet,
        taxvaluedollarcnt,
        yearbuilt,
        taxamount,
        fips,
        latitude,
        longitude        
FROM properties_2017
WHERE propertylandusetypeid = 261; -- Single family home
"""

In [3]:
# get data from codeup database
zillow, q = acquire_.get_codeup_sql_data_(db_name="zillow", query=query,fileName="zillow_single_family")


**Understand data**

In [4]:
zillow.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,latitude,longitude
0,0.0,0.0,,27516.0,,,6037.0,34140430.0,-118625364.0
1,0.0,0.0,,10.0,,,6037.0,34585014.0,-118162010.0
2,0.0,0.0,,10.0,,,6037.0,34563376.0,-118019104.0
3,0.0,0.0,,2108.0,,174.21,6037.0,34526913.0,-118050581.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0,34560018.0,-118169806.0


In [5]:
zillow.shape

(2152863, 9)

In [6]:
zillow.columns

Index(['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet',
       'taxvaluedollarcnt', 'yearbuilt', 'taxamount', 'fips', 'latitude',
       'longitude'],
      dtype='object')

In [7]:
zillow.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 9 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2152852 non-null  float64
 1   bathroomcnt                   2152852 non-null  float64
 2   calculatedfinishedsquarefeet  2144379 non-null  float64
 3   taxvaluedollarcnt             2152370 non-null  float64
 4   yearbuilt                     2143526 non-null  float64
 5   taxamount                     2148421 non-null  float64
 6   fips                          2152863 non-null  float64
 7   latitude                      2152863 non-null  float64
 8   longitude                     2152863 non-null  float64
dtypes: float64(9)
memory usage: 147.8 MB


In [8]:
zillow.describe()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,latitude,longitude
count,2152852.0,2152852.0,2144379.0,2152370.0,2143526.0,2148421.0,2152863.0,2152863.0,2152863.0
mean,3.287196,2.230688,1862.855,461896.2,1960.95,5634.866,6048.377,34007470.0,-118191200.0
std,0.9547544,0.9992796,1222.125,699676.0,22.1622,8178.91,20.43329,248165.4,343966.0
min,0.0,0.0,1.0,1.0,1801.0,1.85,6037.0,33339910.0,-119475800.0
25%,3.0,2.0,1257.0,188170.2,1949.0,2534.98,6037.0,33832130.0,-118384500.0
50%,3.0,2.0,1623.0,327671.0,1958.0,4108.95,6037.0,34004550.0,-118144700.0
75%,4.0,3.0,2208.0,534527.0,1976.0,6414.32,6059.0,34170470.0,-117944200.0
max,25.0,32.0,952576.0,98428910.0,2016.0,1337756.0,6111.0,34819650.0,-117554300.0


In [9]:
# count of numeric columns and object columns
numerics = len(zillow.select_dtypes("number").columns)
objects = len(zillow.select_dtypes("object").columns)

print("Numeric col count:", numerics)
print("object col count:", objects)

Numeric col count: 9
object col count: 0


**What I see:**

- I have 2152863 rows and 7 columns
- 7 of the 29 columns are numric while 0 of them are string object colums
- I have unsure null values in my data
- I also see the descriptive statistics of my data

# Prepare

**Rename columns**

In [10]:
# rename dataframe columns
zillow = zillow.rename(columns={"bedroomcnt":"bedrooms",
                       "bathroomcnt":"bathrooms",
                       "calculatedfinishedsquarefeet":"sqr_feet",
                      "taxvaluedollarcnt":"tax_value",
                      "yearbuilt":"year_built",
                      "taxamount":"tax_amount",
                      "fips":"county"})

**drop nulls**

In [11]:
# drop all nulls in the dataframe
zillow = zillow.dropna()

**convert data type**

In [12]:
# convert data type from float to int
zillow.bedrooms = zillow.bedrooms.astype(int)
zillow.year_built = zillow.year_built.astype(int)

**Remove duplicated rows**

In [13]:
# remove the duplocated rows
zillow = zillow.drop_duplicates(keep="first")

**Remove outliers**

This is done using box plot in the explore phase. only looking at the training data.

In [14]:
# remove outliers
zillow = zillow[zillow.bedrooms <= 7]
zillow = zillow[zillow.bathrooms <= 7]
zillow = zillow[zillow.year_built >= 1900]
zillow = zillow[zillow.sqr_feet <= 5000]
zillow = zillow[zillow.tax_amount <= 20000]

**feature transformation**

In [15]:
# Rename the unique values in fips to county names
zillow.county = zillow.county.astype(str).str.replace("6037.0","Los Angeles").str.replace("6059.0","Orange").str.replace("6111.0","Sam Juan")


**Create dummies**

In [16]:
# create dummie variables
dummies = pd.get_dummies(zillow.county)

# clean dummie column names
dummies_col = dummies.columns.str.replace(" ", "_").str.lower()

dummies.head(2)

Unnamed: 0,Los Angeles,Orange,Sam Juan
4,1,0,0
6,1,0,0


In [17]:
# make a copy of my original data frame
zillow_encoded_scaled = zillow.copy()

# add dummies to my data frame
zillow_encoded_scaled[dummies_col] = dummies
zillow_encoded_scaled.head(2)

Unnamed: 0,bedrooms,bathrooms,sqr_feet,tax_value,year_built,tax_amount,county,latitude,longitude,los_angeles,orange,sam_juan
4,4,2.0,3633.0,296425.0,2005,6941.39,Los Angeles,34560018.0,-118169806.0,1,0,0
6,3,4.0,1620.0,847770.0,2011,10244.94,Los Angeles,33996200.0,-118438000.0,1,0,0


## Split

In [18]:
# split the data into training, validation and testing sets
train, validate, test = prepare_.split_data_(df=zillow_encoded_scaled,
                    test_size=0.2, 
                     validate_size=0.2, 
                     random_state=95)
(train.shape, validate.shape, test.shape)

((1243601, 12), (414534, 12), (414534, 12))

**Scale data**

In [19]:
# scalable features
features_to_scale = train[['bedrooms','bathrooms','sqr_feet','year_built','tax_amount']]

features_to_scale.head(2)

Unnamed: 0,bedrooms,bathrooms,sqr_feet,year_built,tax_amount
1580543,3,2.0,1719.0,1976,5158.46
1776927,4,2.0,1729.0,1968,2901.78


In [20]:
# build a scaling object
scaler = MinMaxScaler()

# Note that we only call .fit with the training data,
# but we use .transform to apply the scaling to all the data splits.
# y Target values: this is not being changed in any way(no predictions are being made)
x_train_scaled = scaler.fit_transform(X=features_to_scale,)

# transfrom the validate and test using the minMax object
x_val_scaled = scaler.transform(X=validate[features_to_scale.columns])
x_test_scaled = scaler.transform(X=test[features_to_scale.columns])


In [21]:
x_train_scaled

array([[0.42857143, 0.28571429, 0.34366873, 0.65517241, 0.25742734],
       [0.57142857, 0.28571429, 0.34566913, 0.5862069 , 0.14451599],
       [0.42857143, 0.28571429, 0.33966793, 0.34482759, 0.02414904],
       ...,
       [0.42857143, 0.28571429, 0.31866373, 0.50862069, 0.06743569],
       [0.42857143, 0.28571429, 0.29445889, 0.6637931 , 0.21413868],
       [0.42857143, 0.28571429, 0.26505301, 0.53448276, 0.1410246 ]])

**Convert scaled features to dataframe**

In [22]:
# New _ariable mames to add to data
new_scale_col = []
for i in features_to_scale.columns:
    new_scale_col.append(f"{i}_scaled")

new_scale_col

['bedrooms_scaled',
 'bathrooms_scaled',
 'sqr_feet_scaled',
 'year_built_scaled',
 'tax_amount_scaled']

In [23]:
# convert to dataframe
x_train_scaled = pd.DataFrame(x_train_scaled)
x_val_scaled = pd.DataFrame(x_val_scaled)
x_test_scaled = pd.DataFrame(x_test_scaled)

# add new column names back to the data frame
x_train_scaled[new_scale_col] = x_train_scaled
x_val_scaled[new_scale_col] = x_val_scaled
x_test_scaled[new_scale_col] = x_test_scaled

# remove redundent columns from new dataframe
x_train_scaled = x_train_scaled[new_scale_col]
x_val_scaled = x_val_scaled[new_scale_col]
x_test_scaled = x_test_scaled[new_scale_col]

x_train_scaled.head()

Unnamed: 0,bedrooms_scaled,bathrooms_scaled,sqr_feet_scaled,year_built_scaled,tax_amount_scaled
0,0.428571,0.285714,0.343669,0.655172,0.257427
1,0.571429,0.285714,0.345669,0.586207,0.144516
2,0.428571,0.285714,0.339668,0.344828,0.024149
3,0.428571,0.285714,0.290258,0.482759,0.053821
4,0.571429,0.428571,0.687538,0.732759,0.456343


In [30]:
x_train_scaled.shape

(1243601, 5)

**Add scaled columns to a copy of original data**

In [24]:
train[x_train_scaled.columns] = x_train_scaled
validate[x_val_scaled.columns] = x_val_scaled
test[x_test_scaled.columns] = x_test_scaled

In [31]:
train.dropna()

Unnamed: 0,bedrooms,bathrooms,sqr_feet,tax_value,year_built,tax_amount,county,latitude,longitude,los_angeles,orange,sam_juan,bedrooms_scaled,bathrooms_scaled,sqr_feet_scaled,year_built_scaled,tax_amount_scaled
532476,3,2.0,1699.0,50343.0,1940,496.09,Los Angeles,33917890.0,-118197747.0,1,0,0,0.428571,0.285714,0.243449,0.465517,0.165679
1159344,4,2.0,2009.0,615802.0,1975,7255.22,Los Angeles,34143859.0,-117841236.0,1,0,0,0.428571,0.285714,0.294059,0.448276,0.130605
404423,4,2.0,1843.0,557492.0,1947,6704.29,Los Angeles,33776525.0,-118327552.0,1,0,0,0.285714,0.285714,0.322264,0.232759,0.251746
1150179,3,2.5,1794.0,524851.0,1986,6284.76,Orange,33881313.0,-117845895.0,0,1,0,0.571429,0.428571,0.468894,0.646552,0.106146
1049513,2,1.0,1307.0,437000.0,1941,5146.71,Los Angeles,33894911.0,-118132660.0,1,0,0,0.428571,0.357143,0.596919,0.750000,0.526721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502203,2,1.0,692.0,98606.0,1913,1414.88,Los Angeles,34063596.0,-117758469.0,1,0,0,0.285714,0.142857,0.212643,0.112069,0.111903
383192,3,1.0,1242.0,421328.0,1949,5396.98,Los Angeles,33921223.0,-118078152.0,1,0,0,0.571429,0.428571,0.597520,0.758621,0.315197
1088549,3,2.0,1594.0,102627.0,1959,1361.23,Los Angeles,34203318.0,-118563213.0,1,0,0,0.428571,0.285714,0.452691,0.525862,0.321837
1240079,3,2.0,1473.0,348674.0,1977,4293.28,Los Angeles,34257515.0,-118454988.0,1,0,0,0.571429,0.285714,0.250050,0.094828,0.120765


**Save split**

In [27]:
# prepare_.save_split_data(encoded_df=zillow, train=train, validate=validate, test=test)