# Kwame's Zillow Zestimates Error Control

Table of contents with header links goes here.

## explore.ipynb

### Once this section of the pipeline is fully coded, I will move the internal (helper) code into `explore.py` and keep the external (calling) code and its output in the final notebook.

**I may make this the final notebook with a rename and cleanup.**

### Set up the environment

In [1]:
# ignore warnings
# NOTE: no category or module is given, so this filter hides every warning
# raised later in the session (pandas notices such as SettingWithCopyWarning
# included) -- TODO consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

# import necessary packages/modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from math import sqrt

from wrangle import get_zillow_data, prepare_zillow
from preprocessing import zillow_main_split, zillow_Xy_split, impute_nulls

# plotting defaults: 10x8 figures with a 14-point base font
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 14

# pandas float display: width 20, thousands separators, two decimals
pd.set_option('display.float_format', '{:20,.2f}'.format)

### Acquire the data

In [2]:
# Load the raw Zillow data through the project's wrangle helper and show its
# dimensions.
# NOTE(review): the output saved under this cell is a row preview rather than
# a shape tuple, and the execution counts jump from 2 to 4 -- the stored output
# looks stale; restart the kernel and run all cells to refresh it.
df = get_zillow_data()
df.shape

Unnamed: 0,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,,,261.0,,,,,14297519,1727539,,...,0,0.03,2017-01-01,,,,,Single Family Residential,,
1,,,261.0,,,,,17052889,1387261,,...,1,0.06,2017-01-01,,,,,Single Family Residential,,
2,,,261.0,,,,,14186244,11677,,...,2,0.01,2017-01-01,,,,,Single Family Residential,,
3,,,261.0,2.0,,,,12177905,2288172,,...,3,-0.1,2017-01-01,,,,Central,Single Family Residential,,
4,,,266.0,2.0,,,1.0,10887214,1970746,,...,4,0.01,2017-01-01,Central,,,Central,Condominium,,


### Tidy the data

In [4]:
# Tidy the raw frame with the project's prepare_zillow helper and preview the
# first rows of the result.
# NOTE: this rebinds `df`, so re-running the cell passes already-prepared data
# back into prepare_zillow; re-run the acquire cell first when repeating it.
df = prepare_zillow(df)
df.head()

Unnamed: 0_level_0,bathcnt,bedcnt,sqft,fips,latitude,longitude,county,yearbuilt,value,logerror
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
14297519,3.5,4.0,3100.0,6059.0,33634931.0,-117869207.0,1286.0,1998.0,1023282.0,0.03
17052889,1.0,2.0,1465.0,6111.0,34449266.0,-119281531.0,2061.0,1967.0,464000.0,0.06
14186244,2.0,3.0,1243.0,6059.0,33886168.0,-117823170.0,1286.0,1962.0,564778.0,0.01
12177905,3.0,4.0,2376.0,6037.0,34245180.0,-118240722.0,3101.0,1970.0,145143.0,-0.1
12095076,3.0,4.0,2962.0,6037.0,34145202.0,-118179824.0,3101.0,1950.0,773303.0,-0.0


### Summarize the data

In [5]:
df.columns

Index(['bathcnt', 'bedcnt', 'sqft', 'fips', 'latitude', 'longitude', 'county',
       'yearbuilt', 'value', 'logerror'],
      dtype='object')

In [6]:
df.shape

(52412, 10)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52412 entries, 14297519 to 13083743
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   bathcnt    52412 non-null  float64
 1   bedcnt     52412 non-null  float64
 2   sqft       52330 non-null  float64
 3   fips       52412 non-null  float64
 4   latitude   52412 non-null  float64
 5   longitude  52412 non-null  float64
 6   county     52412 non-null  float64
 7   yearbuilt  52296 non-null  float64
 8   value      52411 non-null  float64
 9   logerror   52412 non-null  float64
dtypes: float64(10)
memory usage: 4.4 MB


In [8]:
df.isnull().sum()

bathcnt        0
bedcnt         0
sqft          82
fips           0
latitude       0
longitude      0
county         0
yearbuilt    116
value          1
logerror       0
dtype: int64

### Split the data into train, validate, test.

In [10]:
# main split: partition the tidied frame into train, validate, and test
# NOTE(review): the split proportions and any random seed are not visible
# here -- confirm zillow_main_split (preprocessing module) is seeded so the
# shapes reported below are reproducible.
train, validate, test = zillow_main_split(df)

In [11]:
# report the (rows, columns) of each split, in train / validate / test order
for split_name, split_frame in (('train', train), ('validate', validate), ('test', test)):
    print(f'Shape of {split_name} data: {split_frame.shape}')

Shape of train data: (29350, 10)
Shape of validate data: (12579, 10)
Shape of test data: (10483, 10)


### Impute the remaining nulls with medians (computed separately for each split).

In [12]:
def impute_nulls(df, reference=None):
    """
    Fill missing sqft, yearbuilt, and value entries with column medians.

    NOTE: this notebook-level definition shadows the `impute_nulls` imported
    from the preprocessing module at the top of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding `sqft`, `yearbuilt`, and `value` columns to fill.
    reference : pandas.DataFrame, optional
        Frame whose medians supply the fill values. When omitted, the medians
        are taken from `df` itself. Pass the train split here when imputing
        validate or test so their fill values come from training data only.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns filled. The input frame is left
        unmodified, so passing a slice of another frame does not write back
        into it.
    """
    imputed_columns = ['sqft', 'yearbuilt', 'value']
    median_source = df if reference is None else reference
    fill_values = median_source[imputed_columns].median()
    # fillna with a Series fills only the columns named in its index and
    # returns a new DataFrame; every other column is passed through untouched
    return df.fillna(fill_values)

In [13]:
# Fill the remaining nulls in train (using train's own medians), then confirm
# that no column has any nulls left.
train = impute_nulls(train)
train.isnull().sum()

bathcnt      0
bedcnt       0
sqft         0
fips         0
latitude     0
longitude    0
county       0
yearbuilt    0
value        0
logerror     0
dtype: int64

In [14]:
# Fill the remaining nulls in validate, then confirm none are left.
# NOTE(review): called this way, the fill values are medians of validate
# itself rather than of train -- confirm that is intended.
validate = impute_nulls(validate)
validate.isnull().sum()

bathcnt      0
bedcnt       0
sqft         0
fips         0
latitude     0
longitude    0
county       0
yearbuilt    0
value        0
logerror     0
dtype: int64

In [15]:
# Fill the remaining nulls in test, then confirm none are left.
# NOTE(review): called this way, the fill values are medians of test itself
# rather than of train -- confirm that is intended.
test = impute_nulls(test)
test.isnull().sum()

bathcnt      0
bedcnt       0
sqft         0
fips         0
latitude     0
longitude    0
county       0
yearbuilt    0
value        0
logerror     0
dtype: int64

### Split the data into X and y train, X and y validate, X and y test.

In [16]:
# Separate each split into a feature frame (X) and a target (y) with the
# project's zillow_Xy_split helper.
# presumably the target is logerror, since each X keeps 9 of the 10 columns --
# verify against preprocessing.zillow_Xy_split.
X_train, X_validate, X_test, y_train, y_validate, y_test = zillow_Xy_split(train, validate, test)

In [19]:
# sanity check: report the shape of every X split, a spacer, then every y split
feature_splits = {'train': X_train, 'validate': X_validate, 'test': X_test}
target_splits = {'train': y_train, 'validate': y_validate, 'test': y_test}

for split_name, features in feature_splits.items():
    print(f'Shape of X {split_name} data: {features.shape}')
print('\n')
for split_name, target in target_splits.items():
    print(f'Shape of y {split_name} data: {target.shape}')

Shape of X train data: (29350, 9)
Shape of X validate data: (12579, 9)
Shape of X test data: (10483, 9)


Shape of y train data: (29350,)
Shape of y validate data: (12579,)
Shape of y test data: (10483,)
