## Acquire Zillow data and prepare Zillow data for clustering exercises

In [1]:
#imports
import numpy as np
import pandas as pd
import os
from env import get_db_url

In [2]:
# Let pandas display up to 100 rows and 100 columns before truncating output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### Functions from wrangle_zillow.py

In [3]:
from wrangle_zillow import acquire_zillow
from wrangle_zillow import missing_col_values
from wrangle_zillow import missing_row_values
from wrangle_zillow import single_unit_properties
from wrangle_zillow import data_prep

In [4]:
# Load the Zillow data with acquire_zillow and show its (rows, columns).
df = acquire_zillow()
df.shape

Reading from csv file...


  df = acquire_zillow()


(77381, 68)

In [5]:
# Summarize missing values per column (count and fraction of rows).
missing_cols = missing_col_values(df)
missing_cols

Unnamed: 0,count,percent
id,0,0.0
parcelid,0,0.0
airconditioningtypeid,52428,0.677531
architecturalstyletypeid,77175,0.997338
basementsqft,77331,0.999354
bathroomcnt,0,0.0
bedroomcnt,0,0.0
buildingclasstypeid,77366,0.999806
buildingqualitytypeid,27709,0.358085
calculatedbathnbr,609,0.00787


In [6]:
# Summarize how many rows are missing a given number (and fraction) of columns.
missing_rows = missing_row_values(df)
missing_rows

Unnamed: 0,num_cols_missing,pct_cols_missing,num_cols
0,23,0.338235,2
1,24,0.352941,13
2,25,0.367647,24
3,26,0.382353,65
4,27,0.397059,316
5,28,0.411765,455
6,29,0.426471,5270
7,30,0.441176,3455
8,31,0.455882,9891
9,32,0.470588,12579


In [8]:
# Filter with single_unit_properties (imported from wrangle_zillow; presumably keeps
# single-unit properties only -- confirm there), then let data_prep drop the listed
# columns and apply the 50% column / 75% row completeness thresholds.
df = single_unit_properties(df)
remove_cols = ['buildingqualitytypeid', 'heatingorsystemtypeid', 'propertyzoningdesc', 'heatingorsystemdesc']
df = data_prep(df, cols_to_remove=remove_cols, prop_required_column=.5, prop_required_row=.75)
df.shape


(50973, 30)

In [9]:
# Re-check missing values per column after preparation.
df_cols = missing_col_values(df)
df_cols

Unnamed: 0,count,percent
id,0,0.0
parcelid,0,0.0
bathroomcnt,0,0.0
bedroomcnt,0,0.0
calculatedbathnbr,0,0.0
calculatedfinishedsquarefeet,0,0.0
finishedsquarefeet12,0,0.0
fips,0,0.0
fullbathcnt,0,0.0
latitude,0,0.0


### Acquire and Summarize

In [2]:
def acquire_zillow():
    '''
    Return the Zillow 2017 property data as a DataFrame, using a local cache.

    If 'zillow_clustering.csv' exists in the current working directory it is
    read and returned unchanged. Otherwise the query below is run against the
    'zillow' database (connection string from get_db_url), the result is saved
    to that csv file (without the index) and returned.

    The query joins properties_2017 to each parcel's most recent row in
    predictions_2017 (MAX(transactiondate) per parcelid), left-joins the
    description lookup tables, and keeps only rows with a non-null latitude.

    No cleaning is done here: outliers and null values are left in place for
    the preparation step.
    '''
    #assign the file name
    filename = 'zillow_clustering.csv'
    #check if the file exists in the current directory and read it if it is
    if os.path.exists(filename):
        print('Reading from csv file...')
        #read the local .csv into the notebook and return it without touching the database
        return pd.read_csv(filename)
    #assign the sql query to a variable for use in pulling a new copy of the dataset from the database
    query = '''
    SELECT 
    prop_2017.*,
    log.logerror,
    log.transactiondate,
    airconditioningtype.airconditioningdesc,
    architecturalstyletype.architecturalstyledesc,
    buildingclasstype.buildingclassdesc,
    heatingorsystemtype.heatingorsystemdesc,
    propertylandusetype.propertylandusedesc,
    storytype.storydesc,
    typeconstructiontype.typeconstructiondesc
    FROM properties_2017 AS prop_2017
    JOIN (SELECT parcelid, MAX(transactiondate) AS max FROM predictions_2017 GROUP BY parcelid) AS pred_2017 USING(parcelid)
    LEFT JOIN (SELECT * FROM predictions_2017) AS log ON log.parcelid = pred_2017.parcelid AND log.transactiondate = pred_2017.max
    LEFT JOIN airconditioningtype USING(airconditioningtypeid) 
    LEFT JOIN architecturalstyletype USING(architecturalstyletypeid) 
    LEFT JOIN buildingclasstype USING(buildingclasstypeid) 
    LEFT JOIN heatingorsystemtype USING(heatingorsystemtypeid) 
    LEFT JOIN propertylandusetype USING(propertylandusetypeid) 
    LEFT JOIN storytype USING(storytypeid)
    LEFT JOIN typeconstructiontype USING(typeconstructiontypeid)
    WHERE prop_2017.latitude IS NOT NULL;
    '''
    #no local copy was found, so pull a fresh copy of the dataset from the database
    print('Getting a fresh copy from SQL database...')
    df = pd.read_sql(query, get_db_url('zillow'))
    #save a copy of the dataset to the local directory as a .csv file
    df.to_csv(filename, index=False)
    return df

In [3]:
# Load the data and preview the first five rows.
df = acquire_zillow()
df.head()

Reading from csv file...


  df = acquire_zillow()


Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [4]:
# (rows, columns) of df.
df.shape


(77381, 68)

In [5]:
# List every column name in df.
df.columns


Index(['id', 'parcelid', 'airconditioningtypeid', 'architecturalstyletypeid',
       'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fireplacecnt',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcnt', 'yardbuildingsqft17', 'yardb

In [6]:
# Rows per parcelid; a count of 1 for every id means no parcel is duplicated.
df.parcelid.value_counts()

14297519    1
12938199    1
11292200    1
11770136    1
17193656    1
           ..
17220161    1
12182567    1
11349008    1
12096766    1
13101116    1
Name: parcelid, Length: 77381, dtype: int64

In [7]:
# Dtype and non-null count for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77381 entries, 0 to 77380
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77381 non-null  int64  
 1   parcelid                      77381 non-null  int64  
 2   airconditioningtypeid         24953 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77381 non-null  float64
 6   bedroomcnt                    77381 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49672 non-null  float64
 9   calculatedbathnbr             76772 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6023 non-null   float64
 12  calculatedfinishedsquarefeet  77185 non-null  float64
 13  f

In [15]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
count,77381.0,77381.0,24953.0,206.0,50.0,77381.0,77381.0,15.0,49672.0,76772.0,614.0,6023.0,77185.0,73749.0,41.0,3009.0,6023.0,386.0,77381.0,8276.0,76772.0,25474.0,25474.0,1538.0,49440.0,77381.0,77381.0,69142.0,16137.0,867.0,464.0,1074.0,15042.0,77381.0,77381.0,50331,77381.0,75910.0,77381.0,30890.0,77331.0,77381.0,50.0,10096.0,222.0,50563.0,2388.0,70.0,77118.0,17560.0,172.0,77269.0,77380.0,77381.0,77379.0,77376.0,2886,2886.0,77137.0,77381.0,77381,24953,206,15,49440,77381,50,222
unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,75.0,,1907,,,,,,,,,,,,,,,,,,,,,1,,,,265,5,5,2,10,13,1,4
top,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100.0,,LAR1,,,,,,,,,,,,,,,,,,,,,Y,,,,2017-06-30,Central,Contemporary,Buildings having wood or wood and steel frames,Central,Single Family Residential,Basement,Frame
freq,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26714.0,,6747,,,,,,,,,,,,,,,,,,,,,2886,,,,1189,23133,172,14,33550,52320,50,219
mean,1495139.0,13007150.0,1.813289,7.38835,679.72,2.29913,3.053489,3.933333,6.534587,2.316867,66.0,1366.512535,1785.219939,1760.522543,1389.853659,2357.791625,1381.771542,2082.5,6048.819232,1.191155,2.250482,1.815459,350.111957,1.0,3.92051,34008370.0,-118203600.0,29969.28,1.0,517.916955,1.0,1.0,1.0,,261.82634,,60491290.0,33671.242708,2534.158333,187734.864066,96587.086822,1.476952,7.0,1.009608,6.040541,1.110021,305.411223,216.385714,1968.635831,1.434738,1.0,189314.5,490134.5,2016.0,301095.4,5995.532346,,14.087318,60496730000000.0,0.016625,,,,,,,,
std,860907.1,3481346.0,2.967894,2.734542,689.703546,0.996651,1.139096,0.258199,1.721953,0.979755,0.0,671.308125,954.049286,934.02434,123.531688,1188.77521,726.468878,1240.382784,20.748199,0.49108,0.966552,0.588345,261.681641,0.0,3.592779,265388.8,359482.6,123371.1,0.0,156.716017,0.0,0.0,0.0,,5.141199,,205905.2,47169.815982,801.593352,165118.79174,3798.347098,2.824386,0.0,0.11863,0.557285,1.171154,238.392793,190.177514,23.786032,0.544569,0.0,230087.4,653444.0,0.0,492596.0,7622.844153,,2.185663,1535242000000.0,0.170191,,,,,,,,
min,349.0,10711860.0,1.0,2.0,38.0,0.0,0.0,3.0,1.0,1.0,66.0,44.0,128.0,128.0,1056.0,598.0,44.0,380.0,6037.0,1.0,1.0,0.0,0.0,1.0,1.0,33339530.0,-119475400.0,236.0,1.0,24.0,1.0,1.0,1.0,,31.0,,60371010.0,3491.0,1286.0,6952.0,95982.0,0.0,7.0,1.0,4.0,1.0,11.0,12.0,1824.0,1.0,1.0,44.0,1000.0,2016.0,161.0,19.92,,3.0,60371010000000.0,-4.65542,,,,,,,,
25%,752070.0,11538300.0,1.0,7.0,273.0,2.0,2.0,4.0,6.0,2.0,66.0,955.0,1182.0,1172.0,1344.0,1625.0,956.0,993.75,6037.0,1.0,2.0,2.0,0.0,1.0,2.0,33814570.0,-118415300.0,5700.0,1.0,424.0,1.0,1.0,1.0,,261.0,,60373110.0,12447.0,1286.0,46736.0,96193.0,0.0,7.0,1.0,6.0,1.0,170.0,61.5,1953.0,1.0,1.0,84265.0,207000.0,2016.0,85504.0,2715.6325,,14.0,60373110000000.0,-0.024377,,,,,,,,
50%,1497932.0,12531570.0,1.0,7.0,515.0,2.0,3.0,4.0,6.0,2.0,66.0,1257.0,1543.0,1523.0,1440.0,2094.0,1259.0,1812.5,6037.0,1.0,2.0,2.0,436.0,1.0,2.0,34022000.0,-118180800.0,7206.0,1.0,500.0,1.0,1.0,1.0,,261.0,,60376030.0,25218.0,3101.0,118849.0,96389.0,0.0,7.0,1.0,6.0,1.0,251.0,164.5,1970.0,1.0,1.0,136499.0,358975.5,2016.0,203372.0,4450.695,,15.0,60376030000000.0,0.006627,,,,,,,,
75%,2240535.0,14211830.0,1.0,7.0,796.5,3.0,4.0,4.0,8.0,3.0,66.0,1615.0,2113.0,2076.0,1440.0,2838.0,1621.5,3053.5,6059.0,1.0,3.0,2.0,493.0,1.0,7.0,34174390.0,-117928400.0,11831.0,1.0,600.0,1.0,1.0,1.0,,266.0,,60590420.0,45457.0,3101.0,274765.0,96987.0,0.0,7.0,1.0,6.0,1.0,364.0,310.5,1987.0,2.0,1.0,218787.0,569001.5,2016.0,366796.5,6927.79,,15.0,60590420000000.0,0.039203,,,,,,,,


In [9]:
# Per-column missing values: absolute count and fraction of rows.
pd.concat(
    [
        df.isna().sum().rename('count'),
        df.isna().mean().rename('percent'),
    ],
    axis=1,
)

Unnamed: 0,count,percent
id,0,0.000000
parcelid,0,0.000000
airconditioningtypeid,52428,0.677531
architecturalstyletypeid,77175,0.997338
basementsqft,77331,0.999354
...,...,...
buildingclassdesc,77366,0.999806
heatingorsystemdesc,27941,0.361083
propertylandusedesc,0,0.000000
storydesc,77331,0.999354


In [26]:
# Let pandas display up to 100 rows and 100 columns before truncating output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
pd.concat([
    df.isna().sum(axis=1).rename('num_cols_missing'),
    df.isna().mean(axis=1).rename('percent_cols_missing'),
], axis=1).value_counts().to_frame(name='num_cols').sort_index()

In [None]:
# Tally rows by how many columns they are missing, with the (count, fraction)
# pair moved out of the index into ordinary columns.
pd.DataFrame(
    pd.DataFrame({
        'num_cols_missing': df.isna().sum(axis=1),
        'pct_cols_missing': df.isna().mean(axis=1),
    })
    .value_counts()
    .to_frame(name='num_cols')
    .sort_index()
    .reset_index()
)

In [None]:
# Tally rows by how many columns they are missing; reset_index turns the
# (count, fraction) index levels into regular columns.
(
    pd.DataFrame({
        'num_cols_missing': df.isna().sum(axis=1),
        'pct_cols_missing': df.isna().mean(axis=1),
    })
    .value_counts()
    .to_frame(name='num_cols')
    .sort_index()
    .reset_index()
)

In [18]:
def missing_row_values(df):
    '''
    Summarize how incomplete the rows of df are.

    Returns a DataFrame with one line per distinct level of missingness,
    sorted ascending, with the columns:
      num_cols_missing - number of null columns in a row
      pct_cols_missing - the same expressed as a fraction of all columns
      num_cols         - how many ROWS have exactly that many nulls
                         (the name is historical; it is a row count)
    '''
    null_mask = df.isna()
    per_row = pd.concat(
        [
            null_mask.sum(axis=1).rename('num_cols_missing'),
            null_mask.mean(axis=1).rename('pct_cols_missing'),
        ],
        axis=1,
    )
    tally = per_row.value_counts().to_frame(name='num_cols')
    return tally.sort_index().reset_index()


In [12]:
# Apply the row-level missing-value summary to df and display it.
missing_rows = missing_row_values(df)
missing_rows

Unnamed: 0,num_cols_missing,pct_cols_missing,num_cols
0,23,0.338235,2
1,24,0.352941,13
2,25,0.367647,24
3,26,0.382353,65
4,27,0.397059,316
5,28,0.411765,455
6,29,0.426471,5270
7,30,0.441176,3455
8,31,0.455882,9891
9,32,0.470588,12579


In [17]:
def missing_col_values(df):
    '''
    Summarize missing values per column of df.

    Returns a DataFrame indexed by column name with:
      count   - number of null entries in the column
      percent - fraction (0 to 1) of rows that are null in the column
    '''
    null_mask = df.isna()
    return pd.DataFrame({'count': null_mask.sum(), 'percent': null_mask.mean()})

In [14]:
# Apply the column-level missing-value summary to df and display it.
df_cols = missing_col_values(df)
df_cols

Unnamed: 0,count,percent
id,0,0.0
parcelid,0,0.0
airconditioningtypeid,52428,0.677531
architecturalstyletypeid,77175,0.997338
basementsqft,77331,0.999354
bathroomcnt,0,0.0
bedroomcnt,0,0.0
buildingclasstypeid,77366,0.999806
buildingqualitytypeid,27709,0.358085
calculatedbathnbr,609,0.00787


---

### Prepare

In [6]:
# Keep only these land-use type ids. Matching the counts in the value_counts
# outputs of this notebook: 261 Single Family Residential, 263 Mobile Home,
# 265 Cluster Home, 275 Manufactured/Modular/Prefabricated Homes.
type_values = [261.0, 263.0, 275.0, 265.0]
df = df.loc[df['propertylandusetypeid'].isin(type_values)]

# Drop rows reporting 2 or 3 units; rows with a missing unitcnt are kept
# because isin() is False for NaN.
unit_values = [2.0, 3.0]
df = df.loc[~df['unitcnt'].isin(unit_values)]

def remove_columns(df, cols_to_remove):
    '''
    Return a new DataFrame without the columns named in cols_to_remove.
    The input frame is left untouched; an unknown column name raises KeyError.
    '''
    return df.drop(columns=cols_to_remove)

def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75):
    '''
    Drop columns, then rows, that do not hold enough non-null values.

    prop_required_column - fraction of rows that must be non-null for a column
                           to be kept
    prop_required_row    - fraction of the REMAINING columns that must be
                           non-null for a row to be kept

    Returns a new DataFrame; the frame passed in is not modified. (The earlier
    version called dropna(inplace=True), which silently changed the caller's
    frame and raised SettingWithCopyWarning when df was a filtered slice.)
    '''
    # Minimum number of non-null rows a column needs to survive.
    min_rows_present = int(round(prop_required_column * len(df.index), 0))
    kept_columns = df.dropna(axis=1, thresh=min_rows_present)
    # The row threshold is based on the column count AFTER the column drop.
    min_cols_present = int(round(prop_required_row * len(kept_columns.columns), 0))
    return kept_columns.dropna(axis=0, thresh=min_cols_present)

def data_prep(df, cols_to_remove=[], prop_required_column=.5, prop_required_row=.75):
    '''
    Prepare the Zillow frame for modeling and return a frame with no nulls.

    Steps, in order:
      1. drop the columns listed in cols_to_remove
      2. drop columns / rows below the completeness thresholds
         (see handle_missing_values)
      3. fill a missing unitcnt with 1
      4. fill a missing structuretaxvaluedollarcnt with
         taxvaluedollarcnt - landtaxvaluedollarcnt
      5. drop every row that still contains a null

    Raises KeyError if unitcnt, structuretaxvaluedollarcnt, taxvaluedollarcnt
    or landtaxvaluedollarcnt did not survive steps 1-2.

    cols_to_remove keeps its list default for interface compatibility; it is
    only read, never mutated.
    '''
    df = remove_columns(df, cols_to_remove)
    df = handle_missing_values(df, prop_required_column, prop_required_row)
    # Build the filled columns with assign() instead of df[col].fillna(..., inplace=True):
    # the in-place form on a column selection is chained assignment, which warns on
    # recent pandas and does not update df at all under Copy-on-Write.
    df = df.assign(
        unitcnt=df['unitcnt'].fillna(1),
        structuretaxvaluedollarcnt=df['structuretaxvaluedollarcnt'].fillna(
            df['taxvaluedollarcnt'] - df['landtaxvaluedollarcnt']
        ),
    )
    return df.dropna()

# Run the preparation, dropping four columns before the 50% column / 75% row
# completeness thresholds are applied.
df = data_prep(
    df,
    cols_to_remove=[
        'buildingqualitytypeid',
        'heatingorsystemtypeid',
        'propertyzoningdesc',
        'heatingorsystemdesc',
    ],
    prop_required_column=.5,
    prop_required_row=.75,
)




In [17]:
# Number of rows per land-use description.
df.propertylandusedesc.value_counts()

Single Family Residential                     52320
Condominium                                   19294
Duplex (2 Units, Any Combination)              2009
Planned Unit Development                       1944
Quadruplex (4 Units, Any Combination)           727
Triplex (3 Units, Any Combination)              535
Cluster Home                                    333
Mobile Home                                      74
Manufactured, Modular, Prefabricated Homes       58
Residential General                              37
Cooperative                                      29
Commercial/Office/Residential Mixed Used         15
Townhouse                                         6
Name: propertylandusedesc, dtype: int64

In [18]:
# Number of rows per unit count (value_counts skips nulls by default).
df.unitcnt.value_counts()

1.0      47293
2.0       2018
4.0        723
3.0        525
6.0          1
45.0         1
237.0        1
42.0         1
Name: unitcnt, dtype: int64

In [20]:
# Number of rows per land-use type id; the counts line up with the descriptions above.
df.propertylandusetypeid.value_counts()

261.0    52320
266.0    19294
246.0     2009
269.0     1944
248.0      727
247.0      535
265.0      333
263.0       74
275.0       58
260.0       37
267.0       29
31.0        15
264.0        6
Name: propertylandusetypeid, dtype: int64

In [21]:
# Restrict df to the four land-use type ids below. By matching value_counts
# totals: 261 Single Family Residential, 263 Mobile Home, 265 Cluster Home,
# 275 Manufactured/Modular/Prefabricated Homes.
values = [261.0, 263.0, 275.0, 265.0]
df = df.loc[df['propertylandusetypeid'].isin(values)]

In [7]:
# (rows, columns) after the land-use filter.
df.shape

(52785, 68)

In [8]:
# Drop rows reporting 2 or 3 units; a missing unitcnt is kept because
# isin() evaluates to False for NaN.
unit_values = [2.0, 3.0]
df = df.loc[~df['unitcnt'].isin(unit_values)]

In [10]:
def remove_columns(df, cols_to_remove):
    '''
    Return a new DataFrame without the columns named in cols_to_remove.
    The input frame is left untouched; an unknown column name raises KeyError.
    '''
    return df.drop(columns=cols_to_remove)

In [11]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75):
    '''
    Drop columns, then rows, that do not hold enough non-null values.

    prop_required_column - fraction of rows that must be non-null for a column
                           to be kept
    prop_required_row    - fraction of the REMAINING columns that must be
                           non-null for a row to be kept

    Returns a new DataFrame; the frame passed in is not modified. (The earlier
    version called dropna(inplace=True), which silently changed the caller's
    frame and raised SettingWithCopyWarning when df was a filtered slice.)
    '''
    # Minimum number of non-null rows a column needs to survive.
    min_rows_present = int(round(prop_required_column * len(df.index), 0))
    kept_columns = df.dropna(axis=1, thresh=min_rows_present)
    # The row threshold is based on the column count AFTER the column drop.
    min_cols_present = int(round(prop_required_row * len(kept_columns.columns), 0))
    return kept_columns.dropna(axis=0, thresh=min_cols_present)

In [12]:
def data_prep(df, cols_to_remove=[], prop_required_column=.5, prop_required_row=.75):
    '''
    Drop the columns in cols_to_remove, then drop the columns and rows that
    fall below the completeness thresholds, and return the result.
    '''
    trimmed = remove_columns(df, cols_to_remove)
    return handle_missing_values(trimmed, prop_required_column, prop_required_row)

In [13]:
# Apply only the completeness thresholds (no columns removed by name).
df = data_prep(df, cols_to_remove=[], prop_required_column=.5, prop_required_row=.75)

In [14]:
# (rows, columns) after the threshold-based drops.
df.shape

(52651, 34)

In [27]:
# Summary statistics for the remaining columns.
df.describe(include='all')

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
count,52651.0,52651.0,52651.0,52651.0,33682.0,52582.0,52642.0,52447.0,52651.0,52582.0,33834.0,52651.0,52651.0,52275.0,52651.0,52651.0,33773,52651.0,51626.0,52651.0,52631.0,52651.0,52651.0,52609.0,52582.0,52650.0,52651.0,52650.0,52647.0,52546.0,52651.0,52651,33834,52651
unique,,,,,,,,,,,,,,,29.0,,1288,,,,,,,,,,,,,,,259,9,4
top,,,,,,,,,,,,,,,100.0,,LAR1,,,,,,,,,,,,,,,2017-06-30,Central,Single Family Residential
freq,,,,,,,,,,,,,,,26688.0,,6473,,,,,,,,,,,,,,,818,20683,52214
mean,1497068.0,13017890.0,2.300004,3.298114,6.266641,2.302366,1917.956233,1920.457776,6049.553038,2.236678,3.965065,34024720.0,-118200100.0,11321.02,,261.038822,,60498630.0,33513.837253,2522.322007,96639.267238,1.917627,1.0,1963.574502,195365.1,525035.7,2016.0,329923.0,6395.985582,60506620000000.0,0.018239,,,
std,859593.6,3220218.0,1.01429,0.938881,1.715717,1.012138,998.232546,998.37108,21.581261,0.987682,2.562601,273965.9,362124.8,86531.65,,0.51064,,214199.3,48940.963236,801.704357,4595.751743,3.148704,0.0,23.082182,251533.5,714994.5,0.0,524989.1,8303.354761,1855669000000.0,0.175909,,,
min,349.0,10711860.0,0.0,0.0,1.0,1.0,128.0,128.0,6037.0,1.0,1.0,33340620.0,-119475400.0,236.0,,261.0,,60371010.0,3491.0,1286.0,95982.0,0.0,1.0,1878.0,129.0,1000.0,2016.0,161.0,49.18,60371010000000.0,-4.65542,,,
25%,757487.0,11513460.0,2.0,3.0,5.0,2.0,1265.25,1268.0,6037.0,2.0,2.0,33828280.0,-118405600.0,5539.0,,261.0,,60374010.0,12447.0,1286.0,96208.0,0.0,1.0,1950.0,77233.75,193473.5,2016.0,75871.5,2645.57,60374010000000.0,-0.024672,,,
50%,1500193.0,12593680.0,2.0,3.0,6.0,2.0,1653.5,1656.0,6037.0,2.0,2.0,34028430.0,-118156500.0,6820.0,,261.0,,60376210.0,24812.0,3101.0,96415.0,0.0,1.0,1961.0,131551.0,371363.0,2016.0,216441.0,4619.26,60376210000000.0,0.007015,,,
75%,2241716.0,14142070.0,3.0,4.0,8.0,3.0,2299.0,2301.0,6059.0,3.0,7.0,34189530.0,-117931000.0,8769.0,,261.0,,60590420.0,40227.0,3101.0,96996.0,5.0,1.0,1979.0,225000.0,615000.0,2016.0,406100.0,7336.605,60590420000000.0,0.040779,,,


In [28]:
# Preview the first five remaining rows.
df.head()

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
0,1727539,14297519,3.5,4.0,,3.5,3100.0,3100.0,6059.0,3.0,,33634931.0,-117869207.0,4506.0,122,261.0,,60590630.0,53571.0,1286.0,96978.0,0.0,1.0,1998.0,485713.0,1023282.0,2016.0,537569.0,11013.72,60590630000000.0,0.025595,2017-01-01,,Single Family Residential
1,1387261,17052889,1.0,2.0,,1.0,1465.0,1465.0,6111.0,1.0,,34449266.0,-119281531.0,12647.0,1110,261.0,,61110010.0,13091.0,2061.0,97099.0,5.0,1.0,1967.0,88000.0,464000.0,2016.0,376000.0,5672.48,61110010000000.0,0.055619,2017-01-01,,Single Family Residential
2,11677,14186244,2.0,3.0,,2.0,1243.0,1243.0,6059.0,2.0,,33886168.0,-117823170.0,8432.0,122,261.0,,60590220.0,21412.0,1286.0,97078.0,6.0,1.0,1962.0,85289.0,564778.0,2016.0,479489.0,6488.3,60590220000000.0,0.005383,2017-01-01,,Single Family Residential
3,2288172,12177905,3.0,4.0,8.0,3.0,2376.0,2376.0,6037.0,3.0,2.0,34245180.0,-118240722.0,13038.0,101,261.0,LCR110000*,60373000.0,396551.0,3101.0,96330.0,0.0,1.0,1970.0,108918.0,145143.0,2016.0,36225.0,1777.51,60373000000000.0,-0.10341,2017-01-01,Central,Single Family Residential
6,781532,12095076,3.0,4.0,9.0,3.0,2962.0,2962.0,6037.0,3.0,2.0,34145202.0,-118179824.0,63000.0,101,261.0,PSR2,60374610.0,47019.0,3101.0,96293.0,0.0,1.0,1950.0,276684.0,773303.0,2016.0,496619.0,9516.26,60374610000000.0,-0.001011,2017-01-01,Central,Single Family Residential


In [31]:
# Missing values per column that remain after the threshold-based drops.
df_cols = missing_col_values(df)
df_cols

Unnamed: 0,count,percent
id,0,0.0
parcelid,0,0.0
bathroomcnt,0,0.0
bedroomcnt,0,0.0
buildingqualitytypeid,18969,0.360278
calculatedbathnbr,69,0.001311
calculatedfinishedsquarefeet,9,0.000171
finishedsquarefeet12,204,0.003875
fips,0,0.0
fullbathcnt,69,0.001311


In [24]:
# Distribution of unit counts among the remaining rows.
df.unitcnt.value_counts()

1.0    52651
Name: unitcnt, dtype: int64

In [22]:
# Treat a missing unit count as a single unit. The result is assigned back to the
# column rather than calling fillna(inplace=True) on the selection: that chained
# in-place form warns on recent pandas and leaves df unchanged under Copy-on-Write.
df['unitcnt'] = df['unitcnt'].fillna(1)

In [25]:
# Number of rows per heating system description.
df.heatingorsystemdesc.value_counts()

Central       20683
Floor/Wall    12517
Forced air      517
Solar            85
None             16
Baseboard         7
Radiant           6
Gravity           2
Yes               1
Name: heatingorsystemdesc, dtype: int64

In [30]:
# Fill a missing structure value with total value minus land value. The result is
# assigned back to the column rather than calling fillna(inplace=True) on the
# selection: that chained in-place form warns on recent pandas and leaves df
# unchanged under Copy-on-Write.
df['structuretaxvaluedollarcnt'] = df['structuretaxvaluedollarcnt'].fillna(
    df['taxvaluedollarcnt'] - df['landtaxvaluedollarcnt']
)

In [32]:
# Number of rows per zoning description.
df.propertyzoningdesc.value_counts()

LAR1        6473
LARS        1380
LBR1N       1109
SCUR2        712
LARA         612
            ... 
GDE4*          1
WCRIAD1*       1
HPR4*          1
LR7OOO*        1
SDR175         1
Name: propertyzoningdesc, Length: 1288, dtype: int64

In [33]:
# Number of rows per city region id.
df.regionidcity.value_counts()

12447.0    11403
5534.0      1802
40227.0     1497
46298.0     1424
16764.0     1080
           ...  
32927.0        3
31134.0        2
21395.0        1
36078.0        1
10815.0        1
Name: regionidcity, Length: 175, dtype: int64

In [34]:
# Number of rows per census tract-and-block value.
df.censustractandblock.value_counts()

6.037920e+13    32
6.037920e+13    27
6.059032e+13    24
6.037142e+13    24
6.059032e+13    23
                ..
6.037800e+13     1
6.037901e+13     1
6.037113e+13     1
6.059099e+13     1
6.037403e+13     1
Name: censustractandblock, Length: 31541, dtype: int64