# explore.ipynb

### Once the code for this section of the pipeline is complete, I will move it into `explore.py`.

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# import necessary packages/modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from math import sqrt

from wrangle import get_zillow_data, prepare_zillow
from preprocessing import zillow_split

# Plot defaults for the notebook: figure size (10, 8) and font size 14.
plt.rcParams.update({'figure.figsize': (10, 8), 'font.size': 14})

# Render floats with thousands separators and two decimals, padded to width 20.
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [2]:
# Load the Zillow data through the project's wrangle helper, then preview
# the first five rows.
df = get_zillow_data()
df.head(n=5)

Unnamed: 0,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,parcelid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,,,261.0,,,,,14297519,1727539,,...,0,0.03,2017-01-01,,,,,Single Family Residential,,
1,,,261.0,,,,,17052889,1387261,,...,1,0.06,2017-01-01,,,,,Single Family Residential,,
2,,,261.0,,,,,14186244,11677,,...,2,0.01,2017-01-01,,,,,Single Family Residential,,
3,,,261.0,2.0,,,,12177905,2288172,,...,3,-0.1,2017-01-01,,,,Central,Single Family Residential,,
4,,,266.0,2.0,,,1.0,10887214,1970746,,...,4,0.01,2017-01-01,Central,,,Central,Condominium,,


In [3]:
# (rows, columns) of the frame as loaded, before prepare_zillow is applied.
df.shape

(77580, 69)

In [4]:
# Run the project's preparation step and preview the first five rows.
# NOTE(review): this rebinds `df`, so the frame returned by get_zillow_data()
# is no longer reachable afterwards; re-running this cell alone would pass the
# already-prepared frame back into prepare_zillow — confirm that is safe, or
# re-run from the load cell.
df = prepare_zillow(df)
df.head(n=5)

Unnamed: 0_level_0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,regionidcounty,unitcnt,yearbuilt,taxvaluedollarcnt,logerror,propertylandusedesc
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
14297519,3.5,4.0,3100.0,6059.0,33634931.0,-117869207.0,1286.0,1.0,1998.0,1023282.0,0.03,Single Family Residential
17052889,1.0,2.0,1465.0,6111.0,34449266.0,-119281531.0,2061.0,1.0,1967.0,464000.0,0.06,Single Family Residential
14186244,2.0,3.0,1243.0,6059.0,33886168.0,-117823170.0,1286.0,1.0,1962.0,564778.0,0.01,Single Family Residential
12177905,3.0,4.0,2376.0,6037.0,34245180.0,-118240722.0,3101.0,1.0,1970.0,145143.0,-0.1,Single Family Residential
12095076,3.0,4.0,2962.0,6037.0,34145202.0,-118179824.0,3101.0,1.0,1950.0,773303.0,-0.0,Single Family Residential


In [5]:
# Column labels of the frame returned by prepare_zillow.
df.columns

Index(['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet', 'fips',
       'latitude', 'longitude', 'regionidcounty', 'unitcnt', 'yearbuilt',
       'taxvaluedollarcnt', 'logerror', 'propertylandusedesc'],
      dtype='object')

In [6]:
# (rows, columns) of the frame after prepare_zillow.
df.shape

(52412, 12)

In [7]:
# Count the missing values in each column of the prepared frame.
df.isnull().sum(axis=0)

bathroomcnt                       0
bedroomcnt                        0
calculatedfinishedsquarefeet     82
fips                              0
latitude                          0
longitude                         0
regionidcounty                    0
unitcnt                           0
yearbuilt                       116
taxvaluedollarcnt                 1
logerror                          0
propertylandusedesc               0
dtype: int64

In [8]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52412 entries, 14297519 to 13083743
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bathroomcnt                   52412 non-null  float64
 1   bedroomcnt                    52412 non-null  float64
 2   calculatedfinishedsquarefeet  52330 non-null  float64
 3   fips                          52412 non-null  float64
 4   latitude                      52412 non-null  float64
 5   longitude                     52412 non-null  float64
 6   regionidcounty                52412 non-null  float64
 7   unitcnt                       52412 non-null  float64
 8   yearbuilt                     52296 non-null  float64
 9   taxvaluedollarcnt             52411 non-null  float64
 10  logerror                      52412 non-null  float64
 11  propertylandusedesc           52412 non-null  object 
dtypes: float64(11), object(1)
memory usage: 5.2+ MB


### Impute the remaining nulls (`calculatedfinishedsquarefeet`, `yearbuilt`, and `taxvaluedollarcnt`).

### Split the data.